Normalize line endings and whitespace

This commit is contained in:
OpenCV Buildbot
2012-10-17 03:18:30 +04:00
committed by Andrey Kamaev
parent 69020da607
commit 04384a71e4
1516 changed files with 258846 additions and 258162 deletions

View File

@@ -40,123 +40,123 @@
__kernel
void LUT_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
}
__kernel
void LUT2_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
}
__kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
}

View File

@@ -64,28 +64,28 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = abs_diff(src1_data, src2_data);
@@ -112,8 +112,8 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -146,8 +146,8 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -250,20 +250,20 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
@@ -289,7 +289,7 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -320,7 +320,7 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -423,7 +423,7 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -565,7 +565,7 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -575,9 +575,9 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -588,17 +588,17 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -619,7 +619,7 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -643,12 +643,12 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -669,7 +669,7 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -693,12 +693,12 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -716,7 +716,7 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -750,13 +750,13 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -786,13 +786,13 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -65,28 +65,28 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -113,8 +113,8 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -148,8 +148,8 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -253,38 +253,38 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -312,8 +312,8 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -349,8 +349,8 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -395,7 +395,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -425,7 +425,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -456,7 +456,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -478,8 +478,8 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -525,7 +525,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -555,7 +555,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i
int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -584,7 +584,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -613,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -644,7 +644,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -665,8 +665,8 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -692,17 +692,17 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -725,8 +725,8 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -753,12 +753,12 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -809,12 +809,12 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -834,8 +834,8 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -878,15 +878,15 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
@@ -924,15 +924,15 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
@@ -981,7 +981,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -1010,7 +1010,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1039,7 +1039,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1068,7 +1068,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1097,7 +1097,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1128,7 +1128,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -61,30 +61,30 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data;
uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@@ -118,14 +118,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,14 +164,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -209,18 +209,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
@@ -257,16 +257,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -305,16 +305,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));

View File

@@ -60,21 +60,21 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -100,7 +100,7 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -234,7 +234,7 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -379,7 +379,7 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -389,9 +389,9 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -402,17 +402,17 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -433,7 +433,7 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -457,12 +457,12 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -483,7 +483,7 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,12 +507,12 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -530,7 +530,7 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -564,13 +564,13 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -600,13 +600,13 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -62,29 +62,29 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
@@ -112,7 +112,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,7 +147,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -190,7 +190,7 @@ __kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_ste
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -218,7 +218,7 @@ __kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_s
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -248,7 +248,7 @@ __kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -268,7 +268,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -307,12 +307,12 @@ __kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_st
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -335,12 +335,12 @@ __kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -363,11 +363,11 @@ __kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step,
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -390,11 +390,11 @@ __kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -419,11 +419,11 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -444,7 +444,7 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -470,17 +470,17 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -502,7 +502,7 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -513,9 +513,9 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -529,12 +529,12 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -556,7 +556,7 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -567,9 +567,9 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -583,12 +583,12 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -607,7 +607,7 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -615,9 +615,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -649,17 +649,17 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -693,17 +693,17 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -747,7 +747,7 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -773,7 +773,7 @@ __kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_st
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -799,7 +799,7 @@ __kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_ste
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -825,7 +825,7 @@ __kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step,
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -851,7 +851,7 @@ __kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_ste
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -879,7 +879,7 @@ __kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_st
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 & src_data2_2;
char8 data_3 = src_data1_3 & src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data & src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 & src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 & src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data & src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data & src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data & src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
short4 tmp_data_3 = src1_data_3 & src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data & src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
char4 tmp_data_0 = src1_data_0 & src2_data_0;
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data & src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
short4 data_1 = src1_data_1 & src2_data_1;
short4 data_2 = src1_data_2 & src2_data_2;
short4 data_3 = src1_data_3 & src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -95,7 +95,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -129,7 +129,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,7 +164,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -238,12 +238,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data;
data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
char8 data_2 = src_data1_2 | src_data2_2;
char8 data_3 = src_data1_3 | src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -96,7 +96,7 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -130,7 +130,7 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -161,7 +161,7 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -225,7 +225,7 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -283,7 +283,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -316,7 +316,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -338,7 +338,7 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -403,7 +403,7 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -426,7 +426,7 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -447,9 +447,9 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -460,17 +460,17 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -494,7 +494,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -504,9 +504,9 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -517,17 +517,17 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -539,7 +539,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -550,7 +550,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -574,12 +574,12 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -590,7 +590,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -601,7 +601,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -625,12 +625,12 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -641,7 +641,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -685,16 +685,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 tmp_data_0 = src1_data_0 | src2_data_0;
char4 tmp_data_1 = src1_data_1 | src2_data_1;
@@ -709,7 +709,7 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -717,13 +717,13 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -735,7 +735,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -745,7 +745,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -768,7 +768,7 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -790,7 +790,7 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -811,7 +811,7 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -832,7 +832,7 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step,
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -853,7 +853,7 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -877,7 +877,7 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -897,17 +897,17 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
short4 tmp_data_3 = src1_data_3 | src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -54,7 +54,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -65,7 +65,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -93,7 +93,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -131,7 +131,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -142,7 +142,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -166,7 +166,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -177,7 +177,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -201,7 +201,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -220,7 +220,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -230,7 +230,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -279,7 +279,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data | src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -289,7 +289,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -300,7 +300,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -326,7 +326,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -337,7 +337,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,7 +362,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -390,7 +390,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -409,7 +409,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -418,7 +418,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -446,7 +446,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -463,8 +463,8 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -474,7 +474,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -493,7 +493,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data | src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -514,7 +514,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -525,9 +525,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -540,17 +540,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -564,7 +564,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -575,7 +575,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -586,9 +586,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -601,17 +601,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -624,7 +624,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -635,7 +635,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -662,12 +662,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -679,7 +679,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -690,7 +690,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -717,12 +717,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -734,7 +734,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -742,7 +742,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -777,7 +777,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -785,18 +785,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -829,18 +829,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -850,7 +850,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -883,7 +883,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -894,7 +894,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -912,7 +912,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -922,7 +922,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -940,7 +940,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -949,7 +949,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -967,7 +967,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -976,7 +976,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -994,7 +994,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 | src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1003,7 +1003,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@@ -1023,7 +1023,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1055,7 +1055,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1065,10 +1065,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
short4 data_1 = src1_data_1 | src2_data_1;
short4 data_2 = src1_data_2 | src2_data_2;
short4 data_3 = src1_data_3 | src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 ^ src_data2_2;
char8 data_3 = src_data1_3 ^ src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

View File

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data ^ src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 ^ src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 ^ src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data ^ src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data ^ src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data ^ src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

View File

@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data ^ src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
uchar mask_data = * (mask + mask_index);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
uchar mask_data = * (mask + mask_index);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 ^ src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
short4 data_1 = src1_data_1 ^ src2_data_1;
short4 data_2 = src1_data_2 ^ src2_data_2;
short4 data_3 = src1_data_3 ^ src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;

View File

@@ -63,8 +63,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -137,8 +137,8 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -170,11 +170,11 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
int y = get_global_id(1);
if (x < cols && y < rows)
{
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -206,8 +206,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -240,8 +240,8 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -276,8 +276,8 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -312,8 +312,8 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -350,8 +350,8 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -384,8 +384,8 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -417,8 +417,8 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,8 +451,8 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -487,8 +487,8 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -525,8 +525,8 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -563,8 +563,8 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -598,8 +598,8 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -632,8 +632,8 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -667,8 +667,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -59,8 +59,8 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -97,8 +97,8 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -135,8 +135,8 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -169,8 +169,8 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -202,8 +202,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -236,8 +236,8 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -258,7 +258,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
}
#endif
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
@@ -273,8 +273,8 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,8 +311,8 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -349,8 +349,8 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -383,8 +383,8 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -416,8 +416,8 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -450,8 +450,8 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -486,8 +486,8 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -524,8 +524,8 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -562,8 +562,8 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -596,8 +596,8 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -629,8 +629,8 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -663,8 +663,8 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -49,7 +49,7 @@ typedef double F ;
typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F double
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -102,8 +102,8 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -143,8 +143,8 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -183,8 +183,8 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -298,7 +298,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -334,7 +334,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -369,7 +369,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -84,7 +84,7 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = exp(src_data);
*((__global double *)((__global char *)dst + dstIdx )) = dst_data;
// dst[dstIdx] = exp(src[srcIdx]);
}

View File

@@ -48,7 +48,7 @@
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
@@ -62,8 +62,8 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -71,22 +71,22 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
uchar4 src_data_0 = vload4(0, src + src1_index_fix);
uchar4 src_data_1 = vload4(0, src + src2_index_fix);
if(src_index_0 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_index_1 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
}
if(src_index_0 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_index_1 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
}
uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));
@@ -117,8 +117,8 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -159,8 +159,8 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -201,8 +201,8 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -243,7 +243,7 @@ __kernel void arithm_flip_rows_D4 (__global int *src, int src_step, int src_offs
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
@@ -265,7 +265,7 @@ __kernel void arithm_flip_rows_D5 (__global float *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
@@ -289,7 +289,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 3) + dst_offset);
@@ -302,7 +302,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
@@ -315,7 +315,7 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
@@ -337,7 +337,7 @@ __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
@@ -359,7 +359,7 @@ __kernel void arithm_flip_cols_C1_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -381,7 +381,7 @@ __kernel void arithm_flip_cols_C1_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -403,7 +403,7 @@ __kernel void arithm_flip_cols_C1_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -425,7 +425,7 @@ __kernel void arithm_flip_cols_C1_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -449,7 +449,7 @@ __kernel void arithm_flip_cols_C1_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -472,7 +472,7 @@ __kernel void arithm_flip_cols_C2_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -494,7 +494,7 @@ __kernel void arithm_flip_cols_C2_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -516,7 +516,7 @@ __kernel void arithm_flip_cols_C2_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -538,7 +538,7 @@ __kernel void arithm_flip_cols_C2_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -560,7 +560,7 @@ __kernel void arithm_flip_cols_C2_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -582,7 +582,7 @@ __kernel void arithm_flip_cols_C2_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -606,7 +606,7 @@ __kernel void arithm_flip_cols_C2_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -630,7 +630,7 @@ __kernel void arithm_flip_cols_C3_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -662,7 +662,7 @@ __kernel void arithm_flip_cols_C3_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -694,7 +694,7 @@ __kernel void arithm_flip_cols_C3_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -726,7 +726,7 @@ __kernel void arithm_flip_cols_C3_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -758,14 +758,14 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
@@ -773,7 +773,7 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -790,14 +790,14 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
@@ -805,7 +805,7 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -824,14 +824,14 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
@@ -839,7 +839,7 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
@@ -857,7 +857,7 @@ __kernel void arithm_flip_cols_C4_D0 (__global uchar *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -879,7 +879,7 @@ __kernel void arithm_flip_cols_C4_D1 (__global char *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -901,7 +901,7 @@ __kernel void arithm_flip_cols_C4_D2 (__global ushort *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -923,7 +923,7 @@ __kernel void arithm_flip_cols_C4_D3 (__global short *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -946,7 +946,7 @@ __kernel void arithm_flip_cols_C4_D4 (__global int *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -968,7 +968,7 @@ __kernel void arithm_flip_cols_C4_D5 (__global float *src, int src_step, int src
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -991,7 +991,7 @@ __kernel void arithm_flip_cols_C4_D6 (__global double *src, int src_step, int sr
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 5) + dst_offset);

View File

@@ -60,7 +60,7 @@ __kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
@@ -82,7 +82,7 @@ __kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -126,7 +126,7 @@ __kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -148,7 +148,7 @@ __kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -170,7 +170,7 @@ __kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -194,7 +194,7 @@ __kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -239,7 +239,7 @@ __kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
@@ -261,7 +261,7 @@ __kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -283,7 +283,7 @@ __kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -305,7 +305,7 @@ __kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -327,7 +327,7 @@ __kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -351,7 +351,7 @@ __kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -375,7 +375,7 @@ __kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
@@ -441,7 +441,7 @@ __kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -473,7 +473,7 @@ __kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
@@ -506,14 +506,14 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
@@ -521,7 +521,7 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -538,14 +538,14 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
@@ -553,7 +553,7 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
@@ -572,14 +572,14 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 ));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 ));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 ));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 ));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
@@ -587,7 +587,7 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
@@ -605,7 +605,7 @@ __kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -627,7 +627,7 @@ __kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_of
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
@@ -649,7 +649,7 @@ __kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -671,7 +671,7 @@ __kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -693,7 +693,7 @@ __kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_off
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -715,7 +715,7 @@ __kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_o
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -739,7 +739,7 @@ __kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset);

View File

@@ -48,7 +48,7 @@
#endif
#define INF_FLOAT -88.029694
#define INF_DOUBLE -709.0895657128241
#define INF_DOUBLE -709.0895657128241
//////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -112,16 +112,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));

View File

@@ -57,37 +57,37 @@
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
@@ -157,7 +157,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -177,7 +177,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -66,7 +66,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
@@ -74,7 +74,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
@@ -82,7 +82,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
@@ -90,7 +90,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
@@ -98,7 +98,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
@@ -106,12 +106,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_S0)
#define repeat_s(a) a=a;
#define repeat_s(a) a=a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = a.s1;
@@ -125,7 +125,7 @@
#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;
@@ -159,7 +159,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
@@ -188,7 +188,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
@@ -228,9 +228,9 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -291,7 +291,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
@@ -321,7 +321,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
@@ -333,7 +333,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
}
minval = min(minval,m_temp > zero ? temp : max_val);
maxval = max(maxval,m_temp > zero ? temp : min_val);
temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
@@ -361,9 +361,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -68,7 +68,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
@@ -77,7 +77,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
@@ -86,7 +86,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
@@ -95,7 +95,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
@@ -104,7 +104,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
@@ -113,12 +113,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;
@@ -194,7 +194,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
}
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
}
@@ -225,9 +225,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
lm_minloc[lid] =
lm_minloc[lid] =
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
lm_maxloc[lid] =
lm_maxloc[lid] =
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -59,42 +59,42 @@
#define VEC_TYPE char8
#define TYPE char
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define TYPE ushort
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define TYPE short
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define TYPE int
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define TYPE float
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define TYPE double
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

View File

@@ -48,23 +48,23 @@
#endif
int4 round_int4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_int4_sat(v);
}
uint4 round_uint4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_uint4_sat(v);
}
long round_int(float v){
v = v + (v > 0 ? 0.5 : -0.5);
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
}
@@ -85,24 +85,24 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data;
uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
@@ -130,8 +130,8 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -166,8 +166,8 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

View File

@@ -137,7 +137,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -155,7 +155,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -67,7 +67,7 @@ __kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, in
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
@@ -92,7 +92,7 @@ __kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, i
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
*((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
}
@@ -119,7 +119,7 @@ __kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, in
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
float tmp_data = 180*tmp/CV_PI;
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
@@ -146,7 +146,7 @@ __kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, i
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = atan2(data2,data1);
double tmp_data = 180*tmp/CV_PI;
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}

View File

@@ -54,8 +54,8 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, int src1_offset,//magnitue
__global float *src2, int src2_step, int src2_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -74,7 +74,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha) * x;
float a = cos(alpha) * x;
float b = sin(alpha) * x;
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
@@ -85,8 +85,8 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, int src1_offset,//magnitue
__global double *src2, int src2_step, int src2_offset,//angle
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -105,7 +105,7 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i
float ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha) * x;
double a = cos(alpha) * x;
double b = sin(alpha) * x;
*((__global double *)((__global char *)dst1 + dst1_index)) = a;
@@ -118,8 +118,8 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i
/////////////////////////////////////////polarToCart without magnitude//////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int src_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -136,7 +136,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha);
float a = cos(alpha);
float b = sin(alpha);
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
@@ -146,8 +146,8 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int src_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
@@ -164,7 +164,7 @@ __kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int sr
float ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha);
double a = cos(alpha);
double b = sin(alpha);
*((__global double *)((__global char *)dst1 + dst1_index)) = a;

View File

@@ -70,7 +70,7 @@ __kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offse
float src1_data = *((__global float *)((__global char *)src1 + src1_index));
float tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
@@ -92,7 +92,7 @@ __kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offs
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src1_data = *((__global double *)((__global char *)src1 + src1_index));
double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}

View File

@@ -64,8 +64,8 @@ __kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -134,8 +134,8 @@ __kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -240,8 +240,8 @@ __kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -279,8 +279,8 @@ __kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -316,8 +316,8 @@ __kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,7 +362,7 @@ __kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, i
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 - (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -392,7 +392,7 @@ __kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, i
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -424,7 +424,7 @@ __kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step,
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -446,8 +446,8 @@ __kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -493,7 +493,7 @@ __kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step,
int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -523,7 +523,7 @@ __kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, i
int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -552,7 +552,7 @@ __kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, i
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -581,7 +581,7 @@ __kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, i
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -612,7 +612,7 @@ __kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step,
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -633,8 +633,8 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -660,17 +660,17 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2));
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -693,8 +693,8 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -721,12 +721,12 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -749,8 +749,8 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -777,12 +777,12 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -802,8 +802,8 @@ __kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -846,15 +846,15 @@ __kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, i
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
@@ -892,15 +892,15 @@ __kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
@@ -949,7 +949,7 @@ __kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -978,7 +978,7 @@ __kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step,
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1007,7 +1007,7 @@ __kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, i
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1036,7 +1036,7 @@ __kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, i
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1065,7 +1065,7 @@ __kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, i
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1096,7 +1096,7 @@ __kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step,
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 - src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -59,7 +59,7 @@ __kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -94,7 +94,7 @@ __kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -127,7 +127,7 @@ __kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -231,7 +231,7 @@ __kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -385,7 +385,7 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -395,9 +395,9 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -416,17 +416,17 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -447,7 +447,7 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -479,12 +479,12 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -505,7 +505,7 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -537,12 +537,12 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -560,7 +560,7 @@ __kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -602,13 +602,13 @@ __kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -642,13 +642,13 @@ __kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

View File

@@ -62,7 +62,7 @@ __kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -100,7 +100,7 @@ __kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -136,7 +136,7 @@ __kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -182,7 +182,7 @@ __kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_ste
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : - tmp;
int data = convert_int_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -211,7 +211,7 @@ __kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_s
float data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -242,7 +242,7 @@ __kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_
double data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -262,7 +262,7 @@ __kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_ste
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -302,13 +302,13 @@ __kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_st
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -331,13 +331,13 @@ __kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -360,13 +360,13 @@ __kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step,
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -389,12 +389,12 @@ __kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_ste
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -419,12 +419,12 @@ __kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_st
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -444,7 +444,7 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -455,9 +455,9 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
@@ -478,17 +478,17 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -510,7 +510,7 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -521,9 +521,9 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -545,12 +545,12 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -572,7 +572,7 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@@ -583,9 +583,9 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
@@ -607,12 +607,12 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -631,7 +631,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
@@ -639,9 +639,9 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -652,7 +652,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
@@ -681,17 +681,17 @@ __kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_ste
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -729,17 +729,17 @@ __kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_st
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
@@ -789,7 +789,7 @@ __kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_ste
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -818,7 +818,7 @@ __kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_st
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -847,7 +847,7 @@ __kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_ste
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -876,7 +876,7 @@ __kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step,
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -904,7 +904,7 @@ __kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_ste
float4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -933,7 +933,7 @@ __kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_st
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

View File

@@ -151,7 +151,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
if(id < elemnum)
{
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}
@@ -169,7 +169,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
{
idx = offset + id + (id / cols) * invalid_cols;
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp);
}

View File

@@ -159,7 +159,7 @@
#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
#endif
__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
@@ -176,7 +176,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}
@@ -201,7 +201,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}

View File

@@ -43,14 +43,14 @@
//
//M*/
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define LDS_STEP (TILE_DIM + 1)
//8UC1 is not unoptimized, as the size of write per thread is 8
//8UC1 is not unoptimized, as the size of write per thread is 8
//which will use completepath
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -62,13 +62,13 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -87,7 +87,7 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, x);
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -109,14 +109,14 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__global int* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -128,13 +128,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -153,7 +153,7 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -175,13 +175,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__global float* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -193,13 +193,13 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -218,7 +218,7 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -240,14 +240,14 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__global ushort* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -259,13 +259,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -284,7 +284,7 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -306,13 +306,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
{
if((y_index + i) < src_cols)
{
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__global short* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -324,13 +324,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -349,7 +349,7 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -371,13 +371,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -389,13 +389,13 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -414,7 +414,7 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -436,14 +436,14 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__global char* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
@@ -455,13 +455,13 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
@@ -480,7 +480,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
@@ -502,7 +502,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
{
if((y_index + i) < src_cols)
{
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}

View File

@@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Liu Liujun, liujun@multicorewareinc.com
// Liu Liujun, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@@ -43,103 +43,103 @@
//
//M*/
__kernel void BlendLinear_C1_D0(
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
}
__kernel void BlendLinear_C4_D0(
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global uchar *dst,
__global uchar *img1,
__global uchar *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
__kernel void BlendLinear_C1_D5(
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
if (idx < cols && idy < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
__kernel void BlendLinear_C4_D5(
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
__global float *dst,
__global float *img1,
__global float *img2,
__global float *weight1,
__global float *weight2,
int rows,
int cols,
int istep,
int wstep
)
{
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
int idx = get_global_id(0);
int idy = get_global_id(1);
int x = idx / 4;
int y = idy;
if (x < cols && y < rows)
{
int pos = mad24(idy,istep,idx);
int wpos = mad24(idy,wstep,x);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,237 +1,237 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
__kernel
void buildWarpPlaneMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * KRT,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel of the (cols x rows) tile whose
    // top-left corner is (tl_u, tl_v); writes the source coordinate into
    // the remap tables map_x / map_y.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    // KRT packs K*R^-1 (first 9 floats) followed by the translation (next 3).
    __constant float * ck_rinv = KRT;
    __constant float * ct = KRT + 9;

    if (du >= cols || dv >= rows)
        return;

    const float u = tl_u + du;
    const float v = tl_v + dv;

    // Back-project the scaled image point, shifted by the translation.
    const float x_ = u / scale - ct[0];
    const float y_ = v / scale - ct[1];
    // Homogeneous third component, kept exactly as in the original math.
    const float w_ = 1 - ct[2];

    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * w_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * w_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * w_;

    // Perspective divide back to image coordinates.
    x /= z;
    y /= z;

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpCylindricalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel; fills the remap tables for a
    // cylindrical projection using ck_rinv = K * R^-1.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du >= cols || dv >= rows)
        return;

    float u = tl_u + du;
    float v = tl_v + dv;
    u /= scale;

    // Point on the unit cylinder corresponding to angle u and height v/scale.
    const float x_ = sin(u);
    const float y_ = v / scale;
    const float z_ = cos(u);

    // Rotate into the source camera frame.
    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

    if (z > 0)
    {
        x /= z;
        y /= z;
    }
    else
    {
        // Behind the camera: mark as invalid (outside any source image).
        x = y = -1;
    }

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpSphericalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // One work-item per destination pixel; fills the remap tables for a
    // spherical projection using ck_rinv = K * R^-1.
    const int du = get_global_id(0);
    const int dv = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du >= cols || dv >= rows)
        return;

    float u = tl_u + du;
    float v = tl_v + dv;
    v /= scale;
    u /= scale;

    // Point on the unit sphere for spherical angles (u, v).
    const float sinv = sin(v);
    const float x_ = sinv * sin(u);
    const float y_ = - cos(v);
    const float z_ = sinv * cos(u);

    // Rotate into the source camera frame.
    float x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
    float y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
    float z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

    if (z > 0)
    {
        x /= z;
        y /= z;
    }
    else
    {
        // Behind the camera: mark as invalid (outside any source image).
        x = y = -1;
    }

    map_x[dv * step_x + du] = x;
    map_y[dv * step_y + du] = y;
}
__kernel
void buildWarpAffineMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 2x3 affine
    // matrix c_warpMat = [a0 a1 a2; a3 a4 a5] and stores the source
    // coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
__kernel
void buildWarpPerspectiveMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 3x3
    // perspective matrix c_warpMat (row-major) with a homogeneous divide
    // and stores the source coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        // NOTE(review): no guard against a zero denominator, matching the
        // original; callers are expected to pass a non-degenerate matrix.
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
__kernel
void buildWarpPlaneMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * KRT,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a planar warp: each work-item
    // computes the source coordinate of one destination pixel of the
    // (cols x rows) tile whose top-left corner is (tl_u, tl_v).
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    // KRT packs K*R^-1 (first 9 floats) followed by the translation (next 3).
    __constant float * ck_rinv = KRT;
    __constant float * ct = KRT + 9;
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        // Back-project the scaled image point, shifted by the translation.
        float x_ = u / scale - ct[0];
        float y_ = v / scale - ct[1];
        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
        // Perspective divide back to image coordinates.
        x /= z;
        y /= z;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpCylindricalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a cylindrical warp using
    // ck_rinv = K * R^-1; one work-item per destination pixel.
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        u /= scale;
        // Point on the unit cylinder for angle u and height v/scale.
        float x_ = sin(u);
        float y_ = v / scale;
        float z_ = cos(u);
        float z;
        // Rotate into the source camera frame.
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
        // Points behind the camera are marked invalid with (-1, -1).
        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpSphericalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    // Builds per-pixel remap tables for a spherical warp using
    // ck_rinv = K * R^-1; one work-item per destination pixel.
    int du = get_global_id(0);
    int dv = get_global_id(1);
    // step_x/step_y arrive in bytes; convert to float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);
    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;
        v /= scale;
        u /= scale;
        // Point on the unit sphere for spherical angles (u, v).
        float sinv = sin(v);
        float x_ = sinv * sin(u);
        float y_ = - cos(v);
        float z_ = sinv * cos(u);
        float z;
        // Rotate into the source camera frame.
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
        // Points behind the camera are marked invalid with (-1, -1).
        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;
        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}
__kernel
void buildWarpAffineMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 2x3 affine
    // matrix c_warpMat = [a0 a1 a2; a3 a4 a5] and stores the source
    // coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}
__kernel
void buildWarpPerspectiveMaps
(
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    // One work-item per destination pixel; applies the inverse 3x3
    // perspective matrix c_warpMat (row-major) with a homogeneous divide
    // and stores the source coordinate into the remap tables xmap / ymap.
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Byte steps -> float-element strides.
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        // NOTE(review): no guard against a zero denominator, matching the
        // original; callers are expected to pass a non-degenerate matrix.
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        // BUG FIX: the stores previously targeted map_x/map_y, which are
        // not declared in this kernel; the output parameters are xmap/ymap.
        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}

View File

@@ -36,106 +36,106 @@
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
}
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
{
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
if(outaddr.z <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(outaddr.y <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
if(outaddr.z <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(outaddr.y <= pixel_end)
{
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
}

View File

@@ -78,4 +78,4 @@ __kernel void RGB2Gray(int cols,int rows,int src_step,int dst_step,int channels,
int dst_idx = y * dst_step + x * sizeof(DATA_TYPE);
dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
}
}
}

View File

@@ -83,7 +83,7 @@ Now(6/29/2011) the kernels only support 8U data type and the anchor of the convo
kernel must be in the center. ROI is not supported either.
Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
from LDS to calculate the result.
The length of the convolve kernel supported is only related to the MAX size of LDS,
The length of the convolve kernel supported is only related to the MAX size of LDS,
which is HW related.
Niko
6/29/2011
@@ -92,56 +92,56 @@ The info above maybe obsolete.
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
(__global const GENTYPE_SRC * restrict src,
__global GENTYPE_DST * dst,
(__global const GENTYPE_SRC * restrict src,
__global GENTYPE_DST * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
GENTYPE_SRC sum;
GENTYPE_SRC temp[READ_TIMES_COL];
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
GENTYPE_SRC sum;
GENTYPE_SRC temp[READ_TIMES_COL];
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
}
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
}
}

View File

@@ -83,7 +83,7 @@ These kernels are written for separable filters such as Sobel, Scharr, GaussianB
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3,
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
calculate the result.
The length of the convovle kernel supported is related to the LSIZE0 and the MAX size
of LDS, which is HW related.
@@ -96,375 +96,375 @@ The info above maybe obsolete.
***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global const uchar * restrict src,
__global float * dst,
(__global const uchar * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW];
int4 addr;
int s_y;
if(not_all_in_range)
{
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
}
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
temp[i].w = src[addr.w];
}
}
else
{
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
}
}
#endif
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW];
int4 addr;
int s_y;
if(not_all_in_range)
{
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
}
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
temp[i].w = src[addr.w];
}
}
else
{
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
}
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
start_addr = mad24(y,dst_step_in_pixel,x);
//write the result to dst
if((x+3<dst_cols) & (y<dst_rows))
{
*(__global float4*)&dst[start_addr] = sum;
}
else if((x+2<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
dst[start_addr+2] = sum.z;
}
else if((x+1<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
}
else if((x<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
}
//read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
start_addr = mad24(y,dst_step_in_pixel,x);
//write the result to dst
if((x+3<dst_cols) & (y<dst_rows))
{
*(__global float4*)&dst[start_addr] = sum;
}
else if((x+2<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
dst[start_addr+2] = sum.z;
}
else if((x+1<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
}
else if((x<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global const uchar4 * restrict src,
__global float4 * dst,
(__global const uchar4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//judge if read out of boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
// Horizontal pass of a separable filter, single-channel float (C1, depth 5).
// Each work-item cooperatively loads READ_TIMES_ROW source pixels into local
// memory, then convolves a 1 x (2*RADIUSX+1) kernel along the row and writes
// one output pixel.  Border handling is selected at compile time:
// BORDER_CONSTANT zero-fills out-of-range reads via ELEM; otherwise the
// ADDR_L / ADDR_R macros (defined earlier in this file per border mode)
// remap out-of-range coordinates before the read.
//
// src                : source image, float pixels
// dst                : destination image, float pixels
// dst_cols/dst_rows  : valid output region
// src_whole_cols/rows: full source image extent (for border tests)
// src_step_in_pixel  : source row stride, in pixels (not bytes)
// src_offset_x/y     : ROI origin inside the whole source image
// dst_step_in_pixel  : destination row stride, in pixels
// radiusy            : vertical anchor offset of the ROI read
// mat_kernel         : filter taps, length 2*RADIUSX+1
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global const float * restrict src,
 __global float * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 const int src_offset_x,
 const int src_offset_y,
 const int dst_step_in_pixel,
 const int radiusy,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    // leftmost source coordinate this work-item's first read covers
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float sum;
    float temp[READ_TIMES_ROW];

    // +1 column of padding — presumably to avoid local-memory bank
    // conflicts; TODO confirm against the host-side LSIZE choices
    __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    // read pixels from src; clamp the raw address into the buffer so the
    // load itself is always safe (the value is corrected below)
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    // judge if read out of boundary: out-of-range pixels become the
    // constant border value 0
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    // judge if read out of boundary: remap coordinates per border mode,
    // then compute the final linear addresses
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    // read pixels from src
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i] = src[index[i]];
    }
#endif
    // save pixels to lds
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // read pixels from lds and accumulate the symmetric-pair convolution
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1;i<=RADIUSX;i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    // write the result to dst (guard against the padded global range)
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}
// Horizontal pass of a separable filter, four-channel float (C4, depth 5).
// Structure mirrors row_filter_C1_D5 but every pixel is a float4, so each
// tap multiply-accumulates all four channels at once.  Border handling is
// compile-time selected: BORDER_CONSTANT zero-fills out-of-range reads via
// ELEM; otherwise ADDR_L / ADDR_R remap the coordinates before the read.
//
// src                : source image, float4 pixels
// dst                : destination image, float4 pixels
// dst_cols/dst_rows  : valid output region
// src_whole_cols/rows: full source image extent (for border tests)
// src_step_in_pixel  : source row stride, in float4 pixels (not bytes)
// src_offset_x/y     : ROI origin inside the whole source image
// dst_step_in_pixel  : destination row stride, in float4 pixels
// radiusy            : vertical anchor offset of the ROI read
// mat_kernel         : filter taps, length 2*RADIUSX+1 (shared by channels)
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global const float4 * restrict src,
 __global float4 * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 const int src_offset_x,
 const int src_offset_y,
 const int dst_step_in_pixel,
 const int radiusy,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    // leftmost source coordinate this work-item's first read covers
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float4 sum;
    float4 temp[READ_TIMES_ROW];

    // +1 column of padding — presumably to avoid local-memory bank
    // conflicts; TODO confirm against the host-side LSIZE choices
    __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    // read pixels from src; clamp the raw address into the buffer so the
    // load itself is always safe (the value is corrected below)
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    // judge if read out of boundary: out-of-range pixels become the
    // constant border value 0
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    // judge if read out of boundary: remap coordinates per border mode,
    // then compute the final linear addresses
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    // read pixels from src
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        temp[i] = src[index[i]];
    }
#endif
    // save pixels to lds
    for(i = 0;i<READ_TIMES_ROW;i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // read pixels from lds and accumulate the symmetric-pair convolution;
    // temp[0]/temp[1] are reused as scratch for the pair of taps
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1;i<=RADIUSX;i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    // write the result to dst (guard against the padded global range)
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}

View File

@@ -50,8 +50,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -103,12 +103,12 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[(THREADS<<1)];
__local uint4 temp[(THREADS<<1)];
#ifdef BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
@@ -126,15 +126,15 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
@@ -142,13 +142,13 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
@@ -174,7 +174,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
temp[col] = sum1;
temp[col+THREADS] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col >= anX && col < (THREADS-ksX+anX+1))
{
int posX = dst_startX - dst_x_off + (col-anX)*4;
@@ -189,7 +189,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
{
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
}
if(posY < dst_rows && posX < dst_cols)
{
if(posX >= 0 && posX < dst_cols)
@@ -200,7 +200,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
}
}
if(posY+1 < dst_rows && posX < dst_cols)
{
dst_startY+=1;
@@ -212,9 +212,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -237,12 +237,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int dst_startY = (gY << 1) + dst_y_off;
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int end_addr = src_whole_cols-4;
int end_addr = src_whole_cols-4;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
uint4 ss;
@@ -250,12 +250,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = convert_uint4(src[cur_addr]);
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = convert_uint4(src[cur_addr]);
int cur_col = clamp(startX + col, 0, src_whole_cols);
if(con)
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
data[i] = con ? ss : 0;
}
@@ -269,11 +269,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
#endif
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
for(int i=1; i < ksY; i++)
@@ -290,7 +290,7 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -298,11 +298,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
}
}
}
@@ -326,21 +326,21 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
float data[ksY+1];
__local float temp[2][THREADS];
__local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>2) + cur_col];
//ss = src[(startY+i)*(src_step>>2) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;
data[i] = con ? ss : 0.f;
@@ -355,10 +355,10 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
@@ -375,7 +375,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]={0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -383,11 +383,11 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}
@@ -411,21 +411,21 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
float4 data[ksY+1];
__local float4 temp[2][THREADS];
__local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
//ss = src[cur_addr];
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>4) + cur_col];
//ss = src[(startY+i)*(src_step>>4) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
@@ -440,10 +440,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
#endif
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
@@ -460,7 +460,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
@@ -468,10 +468,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}

View File

@@ -51,8 +51,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -90,10 +90,10 @@
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
@@ -101,10 +101,10 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -114,16 +114,16 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -132,7 +132,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
uchar data = *(src + selected_row * src_step + selected_cols);
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -141,7 +141,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
data = *(src + selected_row * src_step + selected_cols);
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -152,7 +152,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
uchar data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -160,7 +160,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -171,9 +171,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -191,9 +191,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
}
}
@@ -205,17 +205,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -225,16 +225,16 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -243,7 +243,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -252,7 +252,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -262,7 +262,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
@@ -270,7 +270,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -281,9 +281,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -301,9 +301,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * data);
}
}
@@ -316,7 +316,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
}
}
}
@@ -324,10 +324,10 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -337,17 +337,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -356,7 +356,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -365,7 +365,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -376,7 +376,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
@@ -384,7 +384,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
@@ -395,9 +395,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -416,9 +416,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
}
}
@@ -427,16 +427,16 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
if(dst_rows_index < dst_rows_end)
{
uchar16 sum1 = convert_uchar16_sat(sum);
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 : dst_data.s0123;
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 : dst_data.s4567;
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab : dst_data.s89ab;
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef : dst_data.scdef;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
}
}
}
@@ -445,10 +445,10 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -458,15 +458,15 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
{
for(int i = 0; i < ROWS_FETCH_C4; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
@@ -475,7 +475,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
@@ -484,7 +484,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -494,7 +494,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
if(lX < (ANX << 1))
{
@@ -502,7 +502,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#endif
}
@@ -512,7 +512,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
if((gY < rows) && (gX < operate_cols))
{
int dst_cols_index = dst_offset_x + gX;
int dst_cols_index = dst_offset_x + gX;
int dst_rows_index = dst_offset_y + gY;
float4 sum = (float4)(0);
@@ -521,11 +521,11 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
{
for(int j = 0; j < ANCHOR; j++)
{
int local_cols = lX + j;
int local_cols = lX + j;
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
}
}
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
}
}

View File

@@ -45,160 +45,160 @@
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#ifndef GENTYPE
__kernel void morph_C1_D0(__global const uchar * restrict src,
__global uchar *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
__global uchar *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
{
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*4*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
int width = (end_x -start_x+4)>>2;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = (point1 % width)<<2;
int tl_y = point1 / width;
int tl_x2 = (point2 % width)<<2;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
uchar4 temp0,temp1;
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*4*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
int width = (end_x -start_x+4)>>2;
int offset = src_offset_x-RADIUSX & 3;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = (point1 % width)<<2;
int tl_y = point1 / width;
int tl_x2 = (point2 % width)<<2;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
uchar4 temp0,temp1;
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = *(__global uchar4*)&src[start_addr];
temp1 = *(__global uchar4*)&src[start_addr2];
//judge if read out of boundary
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = *(__global uchar4*)&src[start_addr];
temp1 = *(__global uchar4*)&src[start_addr2];
//judge if read out of boundary
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
uchar4 res = (uchar4)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
}
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}
else
{
if(gidx+3<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
dst[out_addr+3] = res.w;
}
else if(gidx+2<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
}
else if(gidx+1<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
}
else if(gidx<cols && gidy<rows)
{
dst[out_addr] = res.x;
}
}
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
uchar4 res = (uchar4)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
}
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}
else
{
if(gidx+3<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
dst[out_addr+3] = res.w;
}
else if(gidx+2<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
dst[out_addr+2] = res.z;
}
else if(gidx+1<cols && gidy<rows)
{
dst[out_addr] = res.x;
dst[out_addr+1] = res.y;
}
else if(gidx<cols && gidy<rows)
{
dst[out_addr] = res.x;
}
}
}
#else
__kernel void morph(__global const GENTYPE * restrict src,
__global GENTYPE *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
__global GENTYPE *dst,
int src_offset_x, int src_offset_y,
int cols, int rows,
int src_step_in_pixel, int dst_step_in_pixel,
__constant uchar * mat_kernel,
int src_whole_cols, int src_whole_rows,
int dst_offset_in_pixel)
{
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX;
int end_x = x + src_offset_x+LSIZE0+RADIUSX;
int width = end_x -start_x+1;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = point1 % width;
int tl_y = point1 / width;
int tl_x2 = point2 % width;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
GENTYPE temp0,temp1;
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int x = get_group_id(0)*LSIZE0;
int y = get_group_id(1)*LSIZE1;
int start_x = x+src_offset_x-RADIUSX;
int end_x = x + src_offset_x+LSIZE0+RADIUSX;
int width = end_x -start_x+1;
int start_y = y+src_offset_y-RADIUSY;
int point1 = mad24(l_y,LSIZE0,l_x);
int point2 = point1 + LSIZE0*LSIZE1;
int tl_x = point1 % width;
int tl_y = point1 / width;
int tl_x2 = point2 % width;
int tl_y2 = point2 / width;
int cur_x = start_x + tl_x;
int cur_y = start_y + tl_y;
int cur_x2 = start_x + tl_x2;
int cur_y2 = start_y + tl_y2;
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
GENTYPE temp0,temp1;
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = src[start_addr];
temp1 = src[start_addr2];
//judge if read out of boundary
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
temp0 = src[start_addr];
temp1 = src[start_addr2];
//judge if read out of boundary
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
GENTYPE res = (GENTYPE)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res;
}
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx<cols && gidy<rows)
{
dst[out_addr] = res;
}
LDS_DAT[point1] = temp0;
LDS_DAT[point2] = temp1;
barrier(CLK_LOCAL_MEM_FENCE);
GENTYPE res = (GENTYPE)VAL;
for(int i=0;i<2*RADIUSY+1;i++)
for(int j=0;j<2*RADIUSX+1;j++)
{
res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res;
}
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx<cols && gidy<rows)
{
dst[out_addr] = res;
}
}
#endif

View File

@@ -46,365 +46,365 @@ typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
#define OFF 0
__local int* lcldata = lclshare + OFF;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
__local int* lcldata = lclshare + OFF;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
//assume window size is 20X20
//assume window size is 20X20
#define WINDOWSIZE 20+1
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
for(int i=0;i<read_loop;i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
for(int i=0;i<read_loop;i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_off = mad24(glb_y,pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
int glb_off = mad24(glb_y,pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
}
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
}
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (stage_sum >= stagethreshold);
}
result = (stage_sum >= stagethreshold);
}
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
//barrier(CLK_LOCAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
@@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
/*
if(stagecascade->two_rects)
if(stagecascade->two_rects)
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
@@ -429,10 +429,10 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
counter++;
}
}
@@ -444,75 +444,75 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
t = node[counter].threshold*variance_norm_factor;
classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
if( node[counter].p0[2] )
classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
counter++;
}
}
*/
/*
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
int pixel_step,
int cols,
int rows,
int start_stage,
int end_stage,
int cols,
int rows,
int start_stage,
int end_stage,
//int counts,
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
{
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
p_offset = mad24(y, pixel_step, x);// modify
p_offset = mad24(y, pixel_step, x);// modify
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
// if( cascade->is_stump_based )
//{
// if( cascade->is_stump_based )
//{
for( m = start_stage; m < end_stage; m++ )
{
float stage_sum = 0.f;
@@ -532,29 +532,29 @@ __kernel void gpuRunHaarClassifierCascade_ScaleWindow(
stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
counter++;
}
if (stage_sum < stagecascade->threshold)
{
result = 0;
break;
result = 0;
break;
}
stagecascade++;
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
}
*/

View File

@@ -50,89 +50,89 @@ typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
global int4 * candidate,
const int step,
const int loopcount,
const int start_stage,
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
global int4 * candidate,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int end_stage,
const int startnode,
const int splitnode,
global int4 * p,
//const int4 * pq,
global float * correction,
//const int4 * pq,
global float * correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx=get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
__local int lclshare[1024];
__local int* glboutindex=lclshare+0;
__local int* lclcount=glboutindex+1;
@@ -140,85 +140,85 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
__local float* partialsum=(__local float*)(lcloutindex+(lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t=correction[scalei];
int ystep=(int)(max(2.0f,factor)+0.5f);
for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx){
int4 cascadeinfo=p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx,grpszx,lclidx);
int iy = mad24(grpidy,grpszy,lclidy);
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t=correction[scalei];
int ystep=(int)(max(2.0f,factor)+0.5f);
for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx){
int4 cascadeinfo=p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx,grpszx,lclidx);
int iy = mad24(grpidy,grpszy,lclidy);
int x=ix*ystep;
int y=iy*ystep;
lcloutindex[lcl_id]=0;
lclcount[0]=0;
int result=1,nodecounter;
float mean,variance_norm_factor;
//if((ix < width) && (iy < height))
int result=1,nodecounter;
float mean,variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x +=p_offset;
cascadeinfo.z +=p_offset;
mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
*correction_t;
variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode+nodecount*scalei;
for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
const int p_offset = mad24(y, step, x);
cascadeinfo.x +=p_offset;
cascadeinfo.z +=p_offset;
mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
*correction_t;
variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode+nodecount*scalei;
for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=p_offset;
info1.z +=p_offset;
info2.x +=p_offset;
info2.z +=p_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=p_offset;
info3.z +=p_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result=(stage_sum>=stagethreshold);
}
if(result&&(ix<width)&&(iy<height))
info1.x +=p_offset;
info1.z +=p_offset;
info2.x +=p_offset;
info2.z +=p_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=p_offset;
info3.z +=p_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result=(stage_sum>=stagethreshold);
}
if(result&&(ix<width)&&(iy<height))
{
int queueindex=atomic_inc(lclcount);
lcloutindex[queueindex<<1]=(y<<16)|x;
lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount=lclcount[0];
nodecounter=splitnode+nodecount*scalei;
for(int stageloop=split_stage;stageloop<end_stage&&queuecount>0;stageloop++)
@@ -244,34 +244,34 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
for(int lcl_loop=0;lcl_loop<lcl_loops&&tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_offset;
info1.z +=queue_offset;
info2.x +=queue_offset;
info2.z +=queue_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=queue_offset;
info3.z +=queue_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_offset;
info1.z +=queue_offset;
info2.x +=queue_offset;
info2.z +=queue_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=queue_offset;
info3.z +=queue_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
}
partialsum[lcl_id]=part_sum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i<lcl_compute_win&&(lcl_compute_id==0);i++)
{
stage_sum+=partialsum[lcl_id+i];
}
}
if(stage_sum>=stagethreshold&&(lcl_compute_id==0))
{
int queueindex=atomic_inc(lclcount);
@@ -298,8 +298,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
candidate[outputoff+temp+lcl_id]=candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
}
}
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode * orinode, global GpuHidHaarTreeNode * newnode,float scale,float weight_scale,int nodenum)

View File

@@ -33,106 +33,106 @@
// the use of this software, even if advised of the possibility of such damage.
__kernel void bilateral_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float sum = 0.f, wsum = 0.f;
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float sum = 0.f, wsum = 0.f;
int val0 = (int)src[src_addr];
for(int k = 0; k < maxk; k++ )
{
int val = (int)src[src_addr + space_ofs[k]];
float w = space_weight[k]*color_weight[abs(val - val0)];
sum += (float)(val)*w;
wsum += w;
}
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
}
int val0 = (int)src[src_addr];
for(int k = 0; k < maxk; k++ )
{
int val = (int)src[src_addr + space_ofs[k]];
float w = space_weight[k]*color_weight[abs(val - val0)];
sum += (float)(val)*w;
wsum += w;
}
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
}
}
// Bilateral filter, 8-bit single-channel image, vectorized variant:
// each work-item produces 4 consecutive output pixels (gidx = global id * 4).
//
// dst, dst_step, dst_offset   destination image and its row stride / start offset
// src, src_step               source image; indexed with a +radius shift in both
//                             axes (assumes the caller supplies a border-padded
//                             source — NOTE(review): confirm against host code)
// dst_rows, dst_cols          destination ROI size; also bounds the work-items
// maxk                        number of sample points in the filter window
// radius                      filter radius used for the padded-source offset
// color_weight                LUT indexed by absolute intensity difference
// space_weight                spatial weight of each window sample
// space_ofs                   linear source offset of each window sample
__kernel void bilateral2_C1_D0(__global uchar *dst,
                               __global const uchar *src,
                               const int dst_rows,
                               const int dst_cols,
                               const int maxk,
                               const int radius,
                               const int dst_step,
                               const int dst_offset,
                               const int src_step,
                               const int src_rows,
                               const int src_cols,
                               __constant float *color_weight,
                               __constant float *space_weight,
                               __constant int *space_ofs)
{
    int gidx = get_global_id(0)<<2;   // 4 pixels per work-item
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)(0.f), wsum = (float4)(0.f);

        int4 val0 = convert_int4(vload4(0,src+src_addr));
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
            // per-lane weight = spatial weight * intensity-difference weight
            float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
            sum += convert_float4(val)*w;
            wsum += w;
        }
        // normalize, round (+0.5 then truncate) and store all 4 lanes at once
        *(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
    }
}
// Bilateral filter, 8-bit 4-channel image: one output pixel per work-item.
// The color weight is looked up with the sum of per-channel absolute
// differences of the first three channels (L1 distance over RGB; the 4th
// channel does not influence the weight but is filtered along).
//
// dst, dst_step, dst_offset   destination image and its row stride / start offset
// src, src_step               source image, indexed with a +radius shift in both
//                             axes (assumes a border-padded source — NOTE(review):
//                             confirm against host code)
// dst_rows, dst_cols          destination ROI size; also bounds the work-items
// maxk                        number of sample points in the filter window
// radius                      filter radius used for the padded-source offset
// color_weight                LUT indexed by summed absolute channel difference
// space_weight                spatial weight of each window sample
// space_ofs                   linear source offset (in uchar4 units) per sample
__kernel void bilateral_C4_D0(__global uchar4 *dst,
                              __global const uchar4 *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)0.f;
        float wsum = 0.f;

        int4 val0 = convert_int4(src[src_addr]);
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(src[src_addr + space_ofs[k]]);
            // scalar weight shared by all channels of this sample
            float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
            sum += convert_float4(val)*(float4)w;
            wsum += w;
        }
        // multiply by the reciprocal instead of dividing four times
        wsum=1.f/wsum;
        dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
    }
}

View File

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
@@ -189,12 +189,12 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
if(posX < dst_cols && (posY) < dst_rows)
{
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
}
if(posX < dst_cols && (posY + 1) < dst_rows)
{
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
}
}

View File

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];

View File

@@ -56,19 +56,19 @@ inline float calc(int x, int y)
{
return (float)abs(x) + abs(y);
}
#endif //
#endif //
// Smoothing perpendicular to the derivative direction with a triangle filter
// only support 3x3 Sobel kernel
// only support 3x3 Sobel kernel
// h (-1) = 1, h (0) = 2, h (1) = 1
// h'(-1) = -1, h'(0) = 0, h'(1) = 1
// thus sobel 2D operator can be calculated as:
// h'(x, y) = h'(x)h(y) for x direction
//
//
// src input 8bit single channel image data
// dx_buf output dx buffer
// dy_buf output dy buffer
__kernel
__kernel
void calcSobelRowPass
(
__global const uchar * src,
@@ -99,11 +99,11 @@ __kernel
__local int smem[16][18];
smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
if(lidx == 0)
{
smem[lidy][0] = src[max(gidx - 1, 0) + gidy * src_step + src_offset];
smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -122,7 +122,7 @@ __kernel
// calculate the magnitude of the filter pass combining both x and y directions
// This is the buffered version(3x3 sobel)
//
//
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx direvitive in x direction output
@@ -169,7 +169,7 @@ __kernel
__local int sdx[18][16];
__local int sdy[18][16];
sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
if(lidy == 0)
@@ -199,7 +199,7 @@ __kernel
// calculate the magnitude of the filter pass combining both x and y directions
// This is the non-buffered version(non-3x3 sobel)
//
//
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx direvitive in x direction output
@@ -233,9 +233,9 @@ __kernel
if(gidy < rows && gidx < cols)
{
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
calc(
dx[gidx + gidy * dx_step + dx_offset],
dx[gidx + gidy * dx_step + dx_offset],
dy[gidx + gidy * dy_step + dy_offset]
);
}
@@ -251,7 +251,7 @@ __kernel
// 0 - below low thres, not an edge
// 1 - maybe an edge
// 2 - is an edge, either magnitude is greater than high thres, or
// Given estimates of the image gradients, a search is then carried out
// Given estimates of the image gradients, a search is then carried out
// to determine if the gradient magnitude assumes a local maximum in the gradient direction.
// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
@@ -265,7 +265,7 @@ __kernel
void calcMap
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -362,10 +362,10 @@ __kernel
// non local memory version
__kernel
void calcMap_2
void calcMap_2
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -444,7 +444,7 @@ __kernel
void calcMap_3
(
__global const int * dx,
__global const int * dy,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
@@ -550,9 +550,9 @@ __kernel
//
// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
// marked as edge. Each thread will iterate for 16 times to connect local edges.
// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
//
//
// map raw edge type results calculated from calcMap.
// st the potiential edge points found in this kernel call
// counter the number of potiential edge points
@@ -560,7 +560,7 @@ __kernel
void edgesHysteresisLocal
(
__global int * map,
__global ushort2 * st,
__global ushort2 * st,
volatile __global unsigned int * counter,
int rows,
int cols,
@@ -657,8 +657,8 @@ __kernel
void edgesHysteresisGlobal
(
__global int * map,
__global ushort2 * st1,
__global ushort2 * st2,
__global ushort2 * st1,
__global ushort2 * st2,
volatile __global int * counter,
int rows,
int cols,

View File

@@ -57,24 +57,24 @@
/// CV_32FC1
// Column-wise running (prefix) sum for a CV_32FC1 image: for every column x,
// dst[y][x] = src[0][x] + src[1][x] + ... + src[y][x].
// One work-item walks one full column top to bottom.
//
// src, dst            source / destination float images
// srcCols, srcRows    image size; work-items with x >= srcCols do nothing
// srcStep, dstStep    row strides in BYTES; converted to float elements below
__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
{
    const int x = get_global_id(0);

    // steps arrive in bytes; >>2 converts them to float-element strides
    srcStep >>= 2;
    dstStep >>= 2;

    if (x < srcCols)
    {
        int srcIdx = x;
        int dstIdx = x;

        float sum = 0;

        for (int y = 0; y < srcRows; ++y)
        {
            sum += src[srcIdx];
            dst[dstIdx] = sum;
            srcIdx += srcStep;
            dstIdx += dstStep;
        }
    }
}

View File

@@ -53,7 +53,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
{
__local float smem[16 + 2 * 8][16 + 2 * 8];
int x = get_local_id(0);
int y = get_local_id(1);
int gx = get_global_id(0);
@@ -92,7 +92,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
barrier(CLK_LOCAL_MEM_FENCE);
if (gx < cols && gy < rows)
{
float res = 0;

View File

@@ -65,136 +65,136 @@
#endif
// Generic copyMakeBorder kernel (GENTYPE is defined at build time by the host).
// Copies the source ROI into the destination shifted by (left, top) and fills
// the border area either with the constant `val` (BORDER_CONSTANT) or by
// clamping/reflecting coordinates via the ADDR_L/ADDR_R macros selected by the
// border-type compile flag.
//
// src/dst *_step_in_pixel, *_offset_in_pixel   strides and start offsets,
//                                              expressed in GENTYPE elements
// dst_cols/rows, src_cols/rows                 ROI sizes
// top, left                                    border widths above/left of src
// val                                          fill value for BORDER_CONSTANT
__kernel void copymakeborder
                        (__global const GENTYPE *src,
                         __global GENTYPE *dst,
                         const int dst_cols,
                         const int dst_rows,
                         const int src_cols,
                         const int src_rows,
                         const int src_step_in_pixel,
                         const int src_offset_in_pixel,
                         const int dst_step_in_pixel,
                         const int dst_offset_in_pixel,
                         const int top,
                         const int left,
                         const GENTYPE val
                         )
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int src_x = x-left;
    int src_y = y-top;
    int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
    int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
    // inside the source ROI: straight copy
    int con = (src_x >= 0) && (src_x < src_cols) && (src_y >= 0) && (src_y < src_rows);
    if(con)
    {
        dst[dst_addr] = src[src_addr];
    }
    else
    {
#ifdef BORDER_CONSTANT
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            dst[dst_addr] = val;
        }
#else
        int s_x,s_y;
        //judge if read out of boundary
        s_x= ADDR_L(src_x,0,src_cols,src_x);
        s_x= ADDR_R(src_x,src_cols,s_x);
        s_y= ADDR_L(src_y,0,src_rows,src_y);
        s_y= ADDR_R(src_y,src_rows,s_y);
        src_addr=mad24(s_y,src_step_in_pixel,s_x+src_offset_in_pixel);
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            dst[dst_addr] = src[src_addr];
        }
#endif
    }
}
// copyMakeBorder specialization for 8-bit single-channel images, vectorized:
// each work-item handles 4 consecutive pixels (x = global id * 4) with uchar4
// loads/stores. Fully-interior quads take the fast vload4/store path; quads
// that straddle the source boundary are assembled lane by lane.
//
// Parameters mirror the generic kernel; steps/offsets are in uchar elements,
// `val` is the BORDER_CONSTANT fill value.
__kernel void copymakeborder_C1_D0
                        (__global const uchar *src,
                         __global uchar *dst,
                         const int dst_cols,
                         const int dst_rows,
                         const int src_cols,
                         const int src_rows,
                         const int src_step_in_pixel,
                         const int src_offset_in_pixel,
                         const int dst_step_in_pixel,
                         const int dst_offset_in_pixel,
                         const int top,
                         const int left,
                         const uchar val
                         )
{
    int x = get_global_id(0)<<2;   // 4 pixels per work-item
    int y = get_global_id(1);
    int src_x = x-left;
    int src_y = y-top;
    int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
    int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
    // whole 4-pixel quad inside the source ROI: vectorized copy
    int con = (src_x >= 0) && (src_x+3 < src_cols) && (src_y >= 0) && (src_y < src_rows);
    if(con)
    {
        uchar4 tmp = vload4(0,src+src_addr);
        *(__global uchar4*)(dst+dst_addr) = tmp;
    }
    else
    {
#ifdef BORDER_CONSTANT
        // quad straddles the left or right source edge on a valid row:
        // fetch valid lanes from src, replace out-of-range lanes with val
        if((((src_x<0) && (src_x+3>=0))||(src_x < src_cols) && (src_x+3 >= src_cols)) && (src_y >= 0) && (src_y < src_rows))
        {
            int4 addr;
            uchar4 tmp;
            // out-of-range lanes read address 0 (value discarded below)
            addr.x = ((src_x < 0) || (src_x>= src_cols)) ? 0 : src_addr;
            addr.y = ((src_x+1 < 0) || (src_x+1>= src_cols)) ? 0 : (src_addr+1);
            addr.z = ((src_x+2 < 0) || (src_x+2>= src_cols)) ? 0 : (src_addr+2);
            addr.w = ((src_x+3 < 0) || (src_x+3>= src_cols)) ? 0 : (src_addr+3);
            tmp.x = src[addr.x];
            tmp.y = src[addr.y];
            tmp.z = src[addr.z];
            tmp.w = src[addr.w];
            tmp.x = (src_x >=0)&&(src_x < src_cols) ? tmp.x : val;
            tmp.y = (src_x+1 >=0)&&(src_x +1 < src_cols) ? tmp.y : val;
            tmp.z = (src_x+2 >=0)&&(src_x +2 < src_cols) ? tmp.z : val;
            tmp.w = (src_x+3 >=0)&&(src_x +3 < src_cols) ? tmp.w : val;
            *(__global uchar4*)(dst+dst_addr) = tmp;
        }
        else if((x<dst_cols) && (y<dst_rows))
        {
            // quad entirely outside the source: constant fill
            *(__global uchar4*)(dst+dst_addr) = (uchar4)val;
        }
#else
        int4 s_x;
        int s_y;
        //judge if read out of boundary
        s_x.x= ADDR_L(src_x,0,src_cols,src_x);
        s_x.y= ADDR_L(src_x+1,0,src_cols,src_x+1);
        s_x.z= ADDR_L(src_x+2,0,src_cols,src_x+2);
        s_x.w= ADDR_L(src_x+3,0,src_cols,src_x+3);
        s_x.x= ADDR_R(src_x,src_cols,s_x.x);
        s_x.y= ADDR_R(src_x+1,src_cols,s_x.y);
        s_x.z= ADDR_R(src_x+2,src_cols,s_x.z);
        s_x.w= ADDR_R(src_x+3,src_cols,s_x.w);
        s_y= ADDR_L(src_y,0,src_rows,src_y);
        s_y= ADDR_R(src_y,src_rows,s_y);
        int4 src_addr4=mad24((int4)s_y,(int4)src_step_in_pixel,s_x+(int4)src_offset_in_pixel);
        //write the result to dst
        if((x<dst_cols) && (y<dst_rows))
        {
            uchar4 tmp;
            tmp.x = src[src_addr4.x];
            tmp.y = src[src_addr4.y];
            tmp.z = src[src_addr4.z];
            tmp.w = src[src_addr4.w];
            *(__global uchar4*)(dst+dst_addr) = tmp;
        }
#endif
    }
}

View File

@@ -34,7 +34,7 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define HISTOGRAM256_BIN_COUNT (256)
#define HISTOGRAM256_WORK_GROUP_SIZE (256)
@@ -45,12 +45,12 @@
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
__global const uint4* src,
int src_step, int src_offset,
__global const uint4* src,
int src_step, int src_offset,
__global int* globalHist,
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
{
__local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
int gid = get_global_id(0);
@@ -63,7 +63,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
int offset = (lid & (NBANKS-1));// lid % NBANKS
uint4 data, temp1, temp2, temp3, temp4;
src += src_offset;
//clear LDS
for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
{
@@ -73,7 +73,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
subhist[idx+=lsize] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//read and scatter
int y = gid/cols;
int x = gid - mul24(y, cols);
@@ -87,35 +87,35 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
temp3 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp4 = ((data & mask) << NBANKS_BIT) + offset;
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp4.w);
x += inc_x;
int off = ((x>=cols) ? -1 : 0);
x = mad24(off, cols, x);
y += inc_y - off;
}
barrier(CLK_LOCAL_MEM_FENCE);
//reduce local banks to single histogram per workgroup
//reduce local banks to single histogram per workgroup
int bin1=0, bin2=0, bin3=0, bin4=0;
for(int i=0; i<NBANKS; i+=4)
{
@@ -124,19 +124,19 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
bin3 += subhist[(lid << NBANKS_BIT) + i+2];
bin4 += subhist[(lid << NBANKS_BIT) + i+3];
}
globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
}
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))calc_sub_hist_border_D0(
__global const uchar* src,
int src_step, int src_offset,
__global const uchar* src,
int src_step, int src_offset,
__global int* globalHist,
int left_col, int cols,
int rows, int hist_step)
int rows, int hist_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidy = get_local_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
@@ -160,9 +160,9 @@ __kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))c
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
int src_step)
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
int src_step)
{
int lx = get_local_id(0);
int gx = get_group_id(0);
@@ -183,83 +183,83 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
}
if(lx == 0)
hist[gx] = data[0];
hist[gx] = data[0];
}
// Build the histogram-equalization LUT from a 256-bin histogram:
// computes the cumulative histogram in local memory (serially, by lane 0),
// then writes dst[i] = sat_cast(cdf[i] * scale) with dst[0] forced to 0.
// Must be launched with exactly one 256-wide work-group (see reqd_work_group_size).
//
// dst    256-entry output LUT (uchar)
// hist   256-bin input histogram
// scale  normalization factor applied to the CDF
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calLUT(
        __global uchar * dst,
        __constant int * hist,
        float scale)
{
    int lid = get_local_id(0);
    __local int sumhist[HISTOGRAM256_BIN_COUNT];

    sumhist[lid]=hist[lid];
    barrier(CLK_LOCAL_MEM_FENCE);
    // lane 0 turns the histogram into its inclusive prefix sum (CDF)
    if(lid==0)
    {
        int sum = 0;
        for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
        {
            sum+=sumhist[i];
            sumhist[i]=sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    dst[lid]= lid == 0 ? 0 : convert_uchar_sat(convert_float(sumhist[lid])*scale);
}
/*
///////////////////////////////equalizeHist//////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
{
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
int pos_y = gidx / width;
int pos_x = gidx - mul24(pos_y, width);
for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
}
*/

View File

@@ -73,27 +73,27 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -102,7 +102,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -113,23 +113,23 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
if(lid > 0 && (i+lid) <= rows){
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -139,7 +139,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
@@ -147,7 +147,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -173,27 +173,27 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -202,7 +202,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -213,14 +213,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
@@ -235,7 +235,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1;k <= 8;k++)
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
@@ -245,8 +245,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
if(lid > 0 && (i+lid) <= rows){
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -256,7 +256,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
@@ -264,7 +264,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}

View File

@@ -70,23 +70,23 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -94,7 +94,7 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -104,32 +104,32 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -150,23 +150,23 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
{
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
@@ -174,7 +174,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -184,9 +184,9 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
@@ -201,13 +201,13 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1;k <= 8;k++)
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
lm_sum[0][bf_loc] += sum_t[0];
@@ -223,7 +223,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}

View File

@@ -39,75 +39,75 @@
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep, int m)
{
int dx = get_global_id(0)-(m>>1);
int dx = get_global_id(0)-(m>>1);
int dy = get_global_id(1)-(m>>1);
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
}
*/
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[18][18];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local uchar4 data[18][18];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -115,41 +115,41 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[18][18];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local uchar data[18][18];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -157,41 +157,41 @@ __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[18][18];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local float data[18][18];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -199,41 +199,41 @@ __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, i
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[18][18];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
__local float4 data[18][18];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op(a,b)
@@ -241,36 +241,36 @@ __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[20][20];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local uchar4 data[20][20];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -293,9 +293,9 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -303,36 +303,36 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[20][20];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local uchar data[20][20];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -355,9 +355,9 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -365,36 +365,36 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[20][20];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local float4 data[20][20];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float4 mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -417,9 +417,9 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
@@ -427,36 +427,36 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[20][20];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
__local float data[20][20];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
barrier(CLK_LOCAL_MEM_FENCE);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float mid;
barrier(CLK_LOCAL_MEM_FENCE);
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -479,9 +479,9 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)

View File

@@ -48,7 +48,7 @@
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4 ;
#else
#else
typedef float4 F4;
#endif
@@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -88,10 +88,10 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
@@ -107,7 +107,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -125,7 +125,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -136,10 +136,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
*d = dst_data;
@@ -152,7 +152,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -173,7 +173,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
int8 map_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -184,10 +184,10 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
*d = dst_data;
}
@@ -230,7 +230,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -279,7 +279,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -333,7 +333,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -351,9 +351,9 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
{
x = x << 4;
int gx = x - (dst_offset&15);
@@ -368,25 +368,25 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -402,7 +402,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
@@ -422,23 +422,23 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -455,7 +455,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
@@ -478,23 +478,23 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
int8 map1_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -577,13 +577,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
@@ -607,7 +607,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -617,7 +617,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -649,7 +649,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
@@ -660,10 +660,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -680,13 +680,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
@@ -713,7 +713,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -723,7 +723,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -755,7 +755,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
@@ -766,10 +766,10 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -784,7 +784,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
@@ -801,7 +801,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
@@ -810,10 +810,10 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -835,7 +835,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
@@ -854,7 +854,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
@@ -888,7 +888,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
@@ -901,10 +901,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
__global uchar16* D = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *D;
uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -922,7 +922,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
@@ -944,7 +944,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
@@ -978,7 +978,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
@@ -991,10 +991,10 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
__global uchar16* D = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *D;
uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1039,7 +1039,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1059,13 +1059,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
float8 map1_data;
@@ -1087,7 +1087,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -1097,7 +1097,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -1129,7 +1129,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
@@ -1140,10 +1140,10 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
d = (convert_float4(dc) == (float4)(0))? d : val;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
float4 dVal = *D;
float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1160,13 +1160,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
float4 map1_data;
@@ -1191,7 +1191,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);
@@ -1201,7 +1201,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
@@ -1233,7 +1233,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
@@ -1244,10 +1244,10 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
d = (convert_float4(dc) == (float4)(0))? d : val;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
float4 dVal = *D;
float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1261,9 +1261,9 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
{
x = x << 4;
int gx = x - (dst_offset&15);
@@ -1278,25 +1278,25 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1348,7 +1348,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const
c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
@@ -1395,7 +1395,7 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const
c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
@@ -1430,8 +1430,8 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
int4 src_data;
@@ -1448,7 +1448,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
*d = dst_data;
*d = dst_data;
}
}
*/

View File

@@ -44,14 +44,14 @@
//M*/
// resize kernel
// resize kernel
// Currently, CV_8UC1 CV_8UC4 CV_32FC1 and CV_32FC4are supported.
// We shall support other types later if necessary.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define F double
#else
#else
#define F float
#endif
@@ -63,12 +63,12 @@
#define INC(x,l) ((x+1) >= (l) ? (x):((x)+1))
__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
float4 sx, u, xf;
int4 x, DX;
gx = (gx<<2) - (dstoffset_in_pixel&3);
@@ -80,15 +80,15 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
float sy = ((dy+0.5f) * ify - 0.5f);
int y = floor(sy);
float v = sy - y;
u = x < 0 ? 0 : u;
u = (x >= src_cols) ? 0 : u;
x = x < 0 ? 0 : x;
x = (x >= src_cols) ? src_cols-1 : x;
y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;
int4 U, U1;
int V, V1;
float4 utmp1, utmp2;
@@ -96,8 +96,8 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
float4 scale_vec = INTER_RESIZE_COEF_SCALE;
utmp1 = u * scale_vec;
utmp2 = scale_vec - utmp1;
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
vtmp = v * INTER_RESIZE_COEF_SCALE;
V = rint(vtmp);
V1= rint(INTER_RESIZE_COEF_SCALE - vtmp);
@@ -137,42 +137,42 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = mul24((int4)V1 , val1) + mul24((int4)V , val2);
val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);
pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
pos4.w+=3;
uchar4 uval = convert_uchar4_sat(val);
pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
pos4.w+=3;
uchar4 uval = convert_uchar4_sat(val);
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.w]=uval.w;
}
}
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.w]=uval.w;
}
}
}
__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -186,10 +186,10 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;
u = u * INTER_RESIZE_COEF_SCALE;
v = v * INTER_RESIZE_COEF_SCALE;
int U = rint(u);
int V = rint(v);
int U1= rint(INTER_RESIZE_COEF_SCALE - u);
@@ -197,25 +197,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 data0 = convert_int4(src[srcpos.x]);
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1)
+mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = uval;
}
__kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -229,16 +229,16 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float data0 = src[srcpos.x];
float data1 = src[srcpos.y];
float data2 = src[srcpos.z];
@@ -248,13 +248,13 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
float val2 = u1 * data2 +
u * data3;
float val = v1 * val1 + v * val2;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
dst[dstpos] = val;
}
__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
@@ -268,43 +268,43 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
x>=src_cols ? x=src_cols-1,u=0 : x;
y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float4 s_data1, s_data2, s_data3, s_data4;
s_data1 = src[srcpos.x];
s_data2 = src[srcpos.y];
s_data3 = src[srcpos.z];
s_data4 = src[srcpos.w];
float4 val = u1 * v1 * s_data1 + u * v1 * s_data2
+u1 * v *s_data3 + u * v *s_data4;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
+u1 * v *s_data3 + u * v *s_data4;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
dst[dstpos] = val;
}
__kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
gx = (gx<<2) - (dstoffset_in_pixel&3);
//int4 GX = (int4)(gx, gx+1, gx+2, gx+3);
int4 sx;
int sy;
F ss1 = gx*ifx;
F ss2 = (gx+1)*ifx;
F ss2 = (gx+1)*ifx;
F ss3 = (gx+2)*ifx;
F ss4 = (gx+3)*ifx;
F s5 = dy * ify;
@@ -313,87 +313,87 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
sx.s2 = min((int)floor(ss3), src_cols-1);
sx.s3 = min((int)floor(ss4), src_cols-1);
sy = min((int)floor(s5), src_rows-1);
uchar4 val;
int4 pos = mad24((int4)sy, (int4)srcstep_in_pixel, sx+(int4)srcoffset_in_pixel);
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];
val.s3 = src[pos.s3];
//__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
//uchar4 dVal = *d;
pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos.y++;
pos.z+=2;
pos.w+=3;
pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos.y++;
pos.z+=2;
pos.w+=3;
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.w]=val.w;
}
}
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.w]=val.w;
}
}
}
__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
@@ -406,9 +406,9 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int sy = min(s_row, src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}

View File

@@ -51,7 +51,7 @@
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type
@@ -60,15 +60,15 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha
int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
uchar16 zero = 0;
switch (thresh_type)
{
case 0:
@@ -89,20 +89,20 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha
default:
ddata = sdata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type
@@ -110,16 +110,16 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
{
const int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
float4 zero = 0;
switch (thresh_type)
{
case 0:
@@ -140,14 +140,14 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
default:
ddata = sdata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}

View File

@@ -52,7 +52,7 @@
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -61,9 +61,9 @@ typedef float4 F4;
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -90,9 +90,9 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
int round_delta = (AB_SCALE>>1);
int4 X, Y;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
@@ -105,13 +105,13 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
sx = convert_int4(convert_short4(X >> AB_BITS));
sy = convert_int4(convert_short4(Y >> AB_BITS));
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
@@ -129,7 +129,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u
}
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -139,9 +139,9 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
int4 X, Y;
short4 ax, ay;
int4 sx, sy;
@@ -152,22 +152,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
X = X >> (AB_BITS - INTER_BITS);
Y = Y >> (AB_BITS - INTER_BITS);
sx = convert_int4(convert_short4(X >> INTER_BITS));
sy = convert_int4(convert_short4(Y >> INTER_BITS));
ax = convert_short4(X & (INTER_TAB_SIZE-1));
ay = convert_short4(Y & (INTER_TAB_SIZE-1));
uchar4 v0, v1, v2,v3;
int4 scon0, scon1, scon2, scon3;
int4 spos0, spos1, spos2, spos3;
@@ -200,12 +200,12 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
short4 itab0, itab1, itab2, itab3;
float4 taby, tabx;
taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
@@ -214,30 +214,30 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob
int4 val;
uchar4 tval;
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
dval = convert_uchar4(dcon != 0) ? tval : dval;
*d = dval;
}
}
}
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -249,10 +249,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
@@ -269,14 +269,14 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
@@ -309,16 +309,16 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,
***********************************************************************************************/
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = (AB_SCALE >> 1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -326,26 +326,26 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global
int sx0 = (short)(X0 >> AB_BITS);
int sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
}
}
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
@@ -359,7 +359,7 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;
@@ -371,36 +371,36 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay0;
tabx = 1.f/INTER_TAB_SIZE*ax0;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
dstStep = (dstStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
@@ -413,7 +413,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
@@ -431,7 +431,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
@@ -446,17 +446,17 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
@@ -477,16 +477,16 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
***********************************************************************************************/
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -494,25 +494,25 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
}
}
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -524,7 +524,7 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -538,33 +538,33 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
}
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -576,7 +576,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float v[16];
int i;
@@ -597,7 +597,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
@@ -617,16 +617,16 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
***********************************************************************************************/
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -634,28 +634,28 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
}
}
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -667,7 +667,7 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -681,35 +681,35 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -721,7 +721,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
@@ -742,7 +742,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;

View File

@@ -52,7 +52,7 @@
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -61,9 +61,9 @@ typedef float4 F4;
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
@@ -90,7 +90,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
@@ -118,12 +118,12 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
}
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -132,12 +132,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
int sx = (short)(X >> INTER_BITS);
int sy = (short)(Y >> INTER_BITS);
int ay = (short)(Y & (INTER_TAB_SIZE-1));
int ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[4];
int i;
#pragma unroll 4
@@ -150,7 +150,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
#pragma unroll 4
for(i=0; i<4; i++)
{
@@ -170,12 +170,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
}
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -184,15 +184,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
@@ -208,7 +208,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
@@ -249,12 +249,12 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
@@ -266,37 +266,37 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
}
}
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
@@ -308,46 +308,46 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay;
tabx = 1.f/INTER_TAB_SIZE*ax;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
@@ -365,7 +365,7 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
@@ -380,17 +380,17 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
@@ -411,12 +411,12 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -429,33 +429,33 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
}
}
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
@@ -469,38 +469,38 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
taby[1] = 1.f/INTER_TAB_SIZE*ay;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
tabx[1] = 1.f/INTER_TAB_SIZE*ax;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
}
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
@@ -526,7 +526,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
@@ -546,12 +546,12 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -562,39 +562,39 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
}
}
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows)
{
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx0 = (short)(X >> INTER_BITS);
short sy0 = (short)(Y >> INTER_BITS);
short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
short ax0 = (short)(X & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -608,46 +608,46 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
if( dx < threadCols && dy < dst_rows )
{
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS)-1;
short sy = (short)(Y >> INTER_BITS)-1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
@@ -668,7 +668,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;

View File

@@ -1,252 +1,252 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// atomic add for 32bit floating point
// Atomically add `operand` to the 32-bit float at `source`.
// OpenCL 1.x has no native float atomics, so emulate one with a
// compare-and-swap loop over the value's bit pattern.
inline void atomic_addf(volatile __global float *source, const float operand) {
    union { unsigned int bits; float value; } expected, desired;
    do {
        // Snapshot the current value, compute the sum, then retry the CAS
        // until no other work-item modified *source in between.
        expected.value = *source;
        desired.value  = expected.value + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source,
                            expected.bits, desired.bits) != expected.bits);
}
// Fill a `width` x `height` plane of `image` with the constant `val`.
// `step` is the row stride in elements; `offset` is the plane's starting
// offset within `image`, also in elements.
__kernel void memsetKernel(
    float val,
    __global float * image,
    int width,
    int height,
    int step, // in element
    int offset
    )
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    // One work-item per pixel; out-of-range ids write nothing.
    if (x < width && y < height)
    {
        image[offset + y * step + x] = val;
    }
}
// Divide each element of the destination plane by its accumulated
// normalization factor (produced by forwardWarpKernel).  A zero factor
// means no contribution landed on that pixel; it is left unchanged.
__kernel void normalizeKernel(
    __global float * buffer,
    int width,
    int height,
    int step,
    int f_offset,
    int d_offset
    )
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    if (x >= width || y >= height)
    {
        return;
    }
    __global float * factors = buffer + f_offset;
    __global float * dst     = buffer + d_offset;
    const int   idx   = step * y + x;
    const float scale = factors[idx];
    // Unit scale when the factor is zero: leaves dst untouched.
    dst[idx] *= (scale == 0.0f) ? 1.0f : (1.0f / scale);
}
// Forward-warp `src` along the flow field (u, v) scaled by `time_scale`:
// each source pixel's value is splatted bilinearly into the four
// destination pixels surrounding its warped position.  The bilinear
// weights are also accumulated into a normalization plane so that
// normalizeKernel can renormalize the result afterwards.
__kernel void forwardWarpKernel(
    __global const float * src,
    __global float * buffer,
    __global const float * u,
    __global const float * v,
    const int w,
    const int h,
    const int flow_stride,
    const int image_stride,
    const int factor_offset,
    const int dst_offset,
    const float time_scale
    )
{
    const int j = get_global_id(0);
    const int i = get_global_id(1);
    if (i >= h || j >= w) return;

    volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
    volatile __global float * dst = (volatile __global float *) buffer + dst_offset;

    const int flow_row_offset  = i * flow_stride;
    const int image_row_offset = i * image_stride;

    // Bottom-left corner of the warped target pixel.
    const float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
    const float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;

    // Split the warped coordinate into integer pixel and fractional part.
    float px, py;
    const float dx = modf(cx, &px);
    const float dy = modf(cy, &py);

    const int tx1 = (int) px;  // column of the pixel containing the bottom-right corner
    const int ty1 = (int) py;  // row of the two bottom corners
    const int tx0 = tx1 - 1;   // column of the two left corners
    const int ty0 = ty1 - 1;   // row of the two upper corners

    const float value = src[image_row_offset + j];

    // Splat into the four covered pixels, skipping those outside the frame.
    // atomic_addf is required: several source pixels may warp onto the
    // same destination pixel concurrently.
    if (tx1 >= 0 && tx1 < w && ty1 >= 0 && ty1 < h)
    {
        const float weight = dx * dy;                    // bottom-right corner
        atomic_addf(dst + ty1 * image_stride + tx1, value * weight);
        atomic_addf(normalization_factor + ty1 * image_stride + tx1, weight);
    }
    if (tx0 >= 0 && tx0 < w && ty1 >= 0 && ty1 < h)
    {
        const float weight = (1.0f - dx) * dy;           // bottom-left corner
        atomic_addf(dst + ty1 * image_stride + tx0, value * weight);
        atomic_addf(normalization_factor + ty1 * image_stride + tx0, weight);
    }
    if (tx0 >= 0 && tx0 < w && ty0 >= 0 && ty0 < h)
    {
        const float weight = (1.0f - dx) * (1.0f - dy);  // upper-left corner
        atomic_addf(dst + ty0 * image_stride + tx0, value * weight);
        atomic_addf(normalization_factor + ty0 * image_stride + tx0, weight);
    }
    if (tx1 >= 0 && tx1 < w && ty0 >= 0 && ty0 < h)
    {
        const float weight = dx * (1.0f - dy);           // upper-right corner
        atomic_addf(dst + ty0 * image_stride + tx1, value * weight);
        atomic_addf(normalization_factor + ty0 * image_stride + tx1, weight);
    }
}
// define buffer offsets
// Each constant is a plane index into the packed float working buffer;
// plane k starts at buffer + h * step * k (see blendFramesKernel).
enum
{
O0_OS = 0, // visibility/weight map for frame 0 (compared against 1e-4f)
O1_OS,     // visibility/weight map for frame 1
U_OS,      // flow field, x component
V_OS,      // flow field, y component
UR_OS,     // reverse flow, x component — presumably backward flow; TODO confirm with host code
VR_OS      // reverse flow, y component — presumably backward flow; TODO confirm with host code
};
// Blend two frames at interpolation position `theta` (0 = frame 0,
// 1 = frame 1).  Each output pixel samples both input images at
// flow-compensated coordinates; a pixel visible in only one frame falls
// back to that frame alone.
__kernel void blendFramesKernel(
    image2d_t tex_src0,
    image2d_t tex_src1,
    __global float * buffer,
    __global float * out,
    int w,
    int h,
    int step,
    float theta
    )
{
    int ix = get_global_id(0);
    int iy = get_global_id(1);
    if (ix >= w || iy >= h) return;

    // Per-plane views into the packed working buffer (see the *_OS enum).
    // NOTE(review): the UR_OS/VR_OS planes were read here but never used;
    // the dead loads have been removed.
    __global float * u  = buffer + h * step * U_OS;
    __global float * v  = buffer + h * step * V_OS;
    __global float * o0 = buffer + h * step * O0_OS;
    __global float * o1 = buffer + h * step * O1_OS;

    int pos = ix + step * iy;

    float _u = u[pos];
    float _v = v[pos];

    // Sample at pixel centers (+0.5f), displaced along the flow.
    float x = (float)ix + 0.5f;
    float y = (float)iy + 0.5f;

    bool b0 = o0[pos] > 1e-4f; // visible in frame 0
    bool b1 = o1[pos] > 1e-4f; // visible in frame 1

    float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
    float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));

    if (b0 && b1)
    {
        // pixel is visible on both frames: cross-fade between them
        out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
                   read_imagef(tex_src1, sampler, coord1).x * theta;
    }
    else if (b0)
    {
        // visible on the first frame only
        out[pos] = read_imagef(tex_src0, sampler, coord0).x;
    }
    else
    {
        // visible on the second frame only (or neither: fall back to frame 1)
        out[pos] = read_imagef(tex_src1, sampler, coord1).x;
    }
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// atomic add for 32bit floating point
// Atomically adds `operand` to the 32-bit float at `source`.
// OpenCL 1.x lacks float atomics, so this emulates one with a
// compare-and-swap loop over the value's bit pattern.
inline void atomic_addf(volatile __global float *source, const float operand) {
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
// Snapshot the current value and retry the CAS until no other
// work-item modified *source between the read and the exchange.
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
// Fills a `width` x `height` plane of `image` with the constant `val`.
// `step` is the row stride in elements; `offset` is the plane's starting
// offset within `image`, also in elements.
__kernel void memsetKernel(
float val,
__global float * image,
int width,
int height,
int step, // in element
int offset
)
{
// One work-item per pixel; out-of-range ids write nothing.
if(get_global_id(0) >= width || get_global_id(1) >= height)
{
return;
}
image += offset;
image[get_global_id(0) + get_global_id(1) * step] = val;
}
// Divides each element of the destination plane by its accumulated
// normalization factor (produced by forwardWarpKernel).  A zero factor
// means no contribution landed on that pixel; it is left unchanged.
__kernel void normalizeKernel(
__global float * buffer,
int width,
int height,
int step,
int f_offset, // offset (in elements) of the factor plane within buffer
int d_offset  // offset (in elements) of the data plane within buffer
)
{
__global float * factors = buffer + f_offset;
__global float * dst = buffer + d_offset;
int j = get_global_id(0);
int i = get_global_id(1);
if(j >= width || i >= height)
{
return;
}
float scale = factors[step * i + j];
// Guard against division by zero: unit scale leaves dst untouched.
float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
dst[step * i + j] *= invScale;
}
// Forward-warps `src` along the flow field (u, v) scaled by `time_scale`:
// each source pixel's value is splatted bilinearly into the four
// destination pixels surrounding its warped position.  The bilinear
// weights are also accumulated into a normalization plane so that
// normalizeKernel can renormalize the result afterwards.
__kernel void forwardWarpKernel(
__global const float * src,
__global float * buffer,
__global const float * u,
__global const float * v,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const int factor_offset,  // start of the weight-accumulation plane in buffer
const int dst_offset,     // start of the value-accumulation plane in buffer
const float time_scale
)
{
int j = get_global_id(0);
int i = get_global_id(1);
if (i >= h || j >= w) return;
volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
volatile __global float * dst = (volatile __global float *)buffer + dst_offset;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
//bottom left corner of a target pixel
float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
float px;
float py;
// modf splits the warped coordinate into integer pixel and fraction.
float dx = modf(cx, &px);
float dy = modf(cy, &py);
// target pixel integer coords
int tx;
int ty;
tx = (int) px;
ty = (int) py;
float value = src[image_row_offset + j];
float weight;
// fill pixel containing bottom right corner
// atomic_addf is required: several source pixels may warp onto the
// same destination pixel concurrently.
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing bottom left corner
tx -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper left corner
ty -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper right corner
tx += 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
}
// define buffer offsets
// Each constant is a plane index into the packed float working buffer;
// plane k starts at buffer + h * step * k (see blendFramesKernel).
enum
{
O0_OS = 0, // visibility/weight map for frame 0 (compared against 1e-4f)
O1_OS,     // visibility/weight map for frame 1
U_OS,      // flow field, x component
V_OS,      // flow field, y component
UR_OS,     // reverse flow, x component — presumably backward flow; TODO confirm with host code
VR_OS      // reverse flow, y component — presumably backward flow; TODO confirm with host code
};
// Blends two frames at interpolation position `theta` (0 = frame 0,
// 1 = frame 1).  Each output pixel samples both input images at
// flow-compensated coordinates; a pixel visible in only one frame falls
// back to that frame alone.
__kernel void blendFramesKernel(
image2d_t tex_src0,
image2d_t tex_src1,
__global float * buffer,
__global float * out,
int w,
int h,
int step,
float theta
)
{
// Per-plane views into the packed working buffer (see the *_OS enum).
__global float * u = buffer + h * step * U_OS;
__global float * v = buffer + h * step * V_OS;
__global float * ur = buffer + h * step * UR_OS;
__global float * vr = buffer + h * step * VR_OS;
__global float * o0 = buffer + h * step * O0_OS;
__global float * o1 = buffer + h * step * O1_OS;
int ix = get_global_id(0);
int iy = get_global_id(1);
if(ix >= w || iy >= h) return;
int pos = ix + step * iy;
float _u = u[pos];
float _v = v[pos];
// NOTE(review): _ur/_vr are loaded but never used below — dead reads.
float _ur = ur[pos];
float _vr = vr[pos];
// Sample at pixel centers (+0.5f), displaced along the flow.
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
read_imagef(tex_src1, sampler, coord1).x * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = read_imagef(tex_src0, sampler, coord0).x;
}
else
{
// visible on the second frame only
out[pos] = read_imagef(tex_src1, sampler, coord1).x;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -50,8 +50,8 @@ typedef double F;
typedef float F;
#endif
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off,
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off,
int cols, int rows, int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
@@ -81,9 +81,9 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
int x = minx;
int x = minx;
for( ; x+3 <= maxx; x+=4 )
{
{
int id = src_off + y*in_step + x;
uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
@@ -126,7 +126,7 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
}
if(x+1 == maxx)
{
@@ -213,32 +213,32 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
}
__kernel void meanshift_kernel(__global uchar4* out, int out_step,
__global uchar4* in, int in_step,
__kernel void meanshift_kernel(__global uchar4* out, int out_step,
__global uchar4* in, int in_step,
int dst_off, int src_off, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, in, in_step, dst_off, src_off,
cols, rows, sp, sr, maxIter, eps);
}
__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
__global short2* outsp, int instep, int outrstep,
__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
__global short2* outsp, int instep, int outrstep,
int outspstep, int in_off, int outr_off, int outsp_off,
int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
{
//int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
//*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
// we have ensured before that ((outspstep & 0x11)==0).
outsp_off >>= 2;
outsp_off >>= 2;
outspstep >>= 2;
int basesp = outsp_off + y0 * outspstep + x0;
outsp[basesp] = do_mean_shift(x0, y0, outr, outrstep, in, instep, outr_off, in_off, cols, rows, sp, sr, maxIter, eps);

View File

@@ -59,25 +59,25 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
#define dst_align ((dst_offset & 3) >> 1)
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
__global uchar4 * dst = (__global uchar4 *)(mat_dst + dst_index);
__global uchar * src0 = mat_src0 + src0_index;
__global uchar * src1 = src0 + 1;
__global uchar * src2 = mat_src1 + src1_index;
__global uchar * src3 = src2 + 1;
__global uchar4 * dst = (__global uchar4 *)(mat_dst + dst_index);
__global uchar * src0 = mat_src0 + src0_index;
__global uchar * src1 = src0 + 1;
__global uchar * src2 = mat_src1 + src1_index;
__global uchar * src3 = src2 + 1;
uchar4 dst_data = *dst;
uchar data_0 = *(src0);
@@ -87,8 +87,8 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds
uchar4 tmp_data = (uchar4)(data_0, data_2, data_1, data_3);
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
*dst = tmp_data;
}
@@ -100,25 +100,25 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
#define dst_align ((dst_offset & 3) >> 1)
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
__global char4 * dst = (__global char4 *)(mat_dst + dst_index);
__global char * src0 = mat_src0 + src0_index;
__global char * src1 = src0 + 1;
__global char * src2 = mat_src1 + src1_index;
__global char * src3 = src2 + 1;
__global char4 * dst = (__global char4 *)(mat_dst + dst_index);
__global char * src0 = mat_src0 + src0_index;
__global char * src1 = src0 + 1;
__global char * src2 = mat_src1 + src1_index;
__global char * src3 = src2 + 1;
char4 dst_data = *dst;
char data_0 = *(src0);
@@ -128,8 +128,8 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst
char4 tmp_data = (char4)(data_0, data_2, data_1, data_3);
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;
*dst = tmp_data;
}
@@ -141,12 +141,12 @@ __kernel void merge_vector_C2_D2(__global ushort *mat_dst, int dst_step, int d
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -167,12 +167,12 @@ __kernel void merge_vector_C2_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -193,12 +193,12 @@ __kernel void merge_vector_C2_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -213,12 +213,12 @@ __kernel void merge_vector_C2_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -235,12 +235,12 @@ __kernel void merge_vector_C2_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int dst_index = mad24(y, dst_step , dst_offset);
double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -258,8 +258,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 2;
@@ -268,8 +268,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
uchar data0_0 = *(mat_src0 + src0_index + 0);
@@ -322,8 +322,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 2;
@@ -332,8 +332,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
char data0_0 = *(mat_src0 + src0_index + 0);
@@ -386,8 +386,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
@@ -396,8 +396,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
ushort data0_0 = *((__global ushort *)((__global char *)mat_src0 + src0_index + 0));
@@ -438,8 +438,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
x = x << 1;
@@ -448,8 +448,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
short data0_0 = *((__global short *)((__global char *)mat_src0 + src0_index + 0));
@@ -490,13 +490,13 @@ __kernel void merge_vector_C3_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -524,13 +524,13 @@ __kernel void merge_vector_C3_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -560,13 +560,13 @@ __kernel void merge_vector_C3_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int dst_index = mad24(y, dst_step , dst_offset);
@@ -596,14 +596,14 @@ __kernel void merge_vector_C4_D0(__global uchar *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
uchar src0 = *(mat_src0 + src0_index + x );
@@ -622,14 +622,14 @@ __kernel void merge_vector_C4_D1(__global char *mat_dst, int dst_step, int dst
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
char src0 = *(mat_src0 + src0_index + x );
@@ -648,14 +648,14 @@ __kernel void merge_vector_C4_D2(__global ushort *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
ushort src0 = *((__global ushort *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -674,14 +674,14 @@ __kernel void merge_vector_C4_D3(__global short *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
short src0 = *((__global short *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -700,14 +700,14 @@ __kernel void merge_vector_C4_D4(__global int *mat_dst, int dst_step, int dst_
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -726,14 +726,14 @@ __kernel void merge_vector_C4_D5(__global float *mat_dst, int dst_step, int ds
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -754,14 +754,14 @@ __kernel void merge_vector_C4_D6(__global double *mat_dst, int dst_step, int d
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int src0_index = mad24(y, src0_step, src0_offset);
int src1_index = mad24(y, src1_step, src1_offset);
int src2_index = mad24(y, src2_step, src2_offset);
int src3_index = mad24(y, src3_step, src3_offset);
int dst_index = mad24(y, dst_step , dst_offset);
double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -783,8 +783,8 @@ __kernel void merge_vector_C2_D0_1(int rows, int cols,
__global uchar *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -807,8 +807,8 @@ __kernel void merge_vector_C2_D1_1(int rows, int cols,
__global char *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -831,8 +831,8 @@ __kernel void merge_vector_C2_D2_1(int rows, int cols,
__global ushort *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -855,8 +855,8 @@ __kernel void merge_vector_C2_D3_1(int rows, int cols,
__global short *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -880,8 +880,8 @@ __kernel void merge_vector_C2_D4_1(int rows, int cols,
__global int *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int *)((__global uchar *)mat_src0 + y * src0_step);
@@ -904,8 +904,8 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
__global float *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float *)((__global uchar *)mat_src0 + y * src0_step);
@@ -915,7 +915,7 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
float value1 = src0_y[x];
float value2 = src1_y[x];
dst_y[x] = (float2)(value1, value2);
dst_y[x] = (float2)(value1, value2);
}
}
@@ -926,8 +926,8 @@ __kernel void merge_vector_C2_D6_1(int rows, int cols,
__global double *mat_src1, int src1_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double *)((__global uchar *)mat_src0 + y * src0_step);
@@ -949,8 +949,8 @@ __kernel void merge_vector_C3_D0_1(int rows, int cols,
__global uchar *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -981,8 +981,8 @@ __kernel void merge_vector_C3_D1_1(int rows, int cols,
__global char *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1027,8 +1027,8 @@ __kernel void merge_vector_C3_D2_1(int rows, int cols,
__global ushort *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1054,8 +1054,8 @@ __kernel void merge_vector_C3_D3_1(int rows, int cols,
__global short *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1091,8 +1091,8 @@ __kernel void merge_vector_C3_D4_1(int rows, int cols,
__global int *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int * )((__global char *)mat_src0 + y * src0_step);
@@ -1123,8 +1123,8 @@ __kernel void merge_vector_C3_D5_1(int rows, int cols,
__global float *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float * )((__global char *)mat_src0 + y * src0_step);
@@ -1151,8 +1151,8 @@ __kernel void merge_vector_C3_D6_1(int rows, int cols,
__global double *mat_src2, int src2_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double * )((__global char *)mat_src0 + y * src0_step);
@@ -1179,8 +1179,8 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
__global uchar *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -1196,7 +1196,7 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
uchar4 value3 = src3_y[x];
dst_y[x] = (uchar16)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y,
value0.y, value1.y, value2.y, value3.y,
value0.z, value1.z, value2.z, value3.z,
value0.w, value1.w, value2.w, value3.w);
}
@@ -1210,8 +1210,8 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
__global char *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1227,7 +1227,7 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
char4 value3 = src3_y[x];
dst_y[x] = (char16)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y,
value0.y, value1.y, value2.y, value3.y,
value0.z, value1.z, value2.z, value3.z,
value0.w, value1.w, value2.w, value3.w);
}
@@ -1240,8 +1240,8 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
__global ushort *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global ushort2 *src0_y = (__global ushort2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1257,7 +1257,7 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
ushort2 value3 = src3_y[x];
dst_y[x] = (ushort8)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y);
value0.y, value1.y, value2.y, value3.y);
}
}
__kernel void merge_vector_C4_D3_1(int rows, int cols,
@@ -1268,8 +1268,8 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
__global short *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global short2 *src0_y = (__global short2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1285,7 +1285,7 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
short2 value3 = src3_y[x];
dst_y[x] = (short8)(value0.x, value1.x, value2.x, value3.x,
value0.y, value1.y, value2.y, value3.y);
value0.y, value1.y, value2.y, value3.y);
}
}
__kernel void merge_vector_C4_D4_1(int rows, int cols,
@@ -1296,8 +1296,8 @@ __kernel void merge_vector_C4_D4_1(int rows, int cols,
__global int *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global int *src0_y = (__global int * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1323,8 +1323,8 @@ __kernel void merge_vector_C4_D5_1(int rows, int cols,
__global float *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global float *src0_y = (__global float * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1352,8 +1352,8 @@ __kernel void merge_vector_C4_D6_1(int rows, int cols,
__global double *mat_src3, int src3_step)
{
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
if ((x < cols) && (y < rows))
{
__global double *src0_y = (__global double * )((__global uchar*)mat_src0 + y * src0_step);

View File

@@ -210,7 +210,7 @@ __kernel void icvCalcLayerDetAndTrace(
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
@@ -246,9 +246,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -288,26 +288,26 @@ __kernel
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -372,9 +372,9 @@ __kernel
__kernel
void icvFindMaximaInLayer(
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -417,19 +417,19 @@ __kernel
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
N9[localLin - zoff] =
N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
N9[localLin ] =
N9[localLin ] =
det[det_step * (l_y ) + l_x];
N9[localLin + zoff] =
N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -497,17 +497,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
{
F invdet = 1.0 / det;
x[0] = invdet *
x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
x[1] = invdet *
x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
x[2] = invdet *
x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
@@ -528,9 +528,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
__kernel
void icvInterpolateKeypoint(
__global const float * det,
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global unsigned int * featureCounter,
@@ -560,7 +560,7 @@ __kernel
volatile __local float N9[3][3][3];
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
@@ -658,27 +658,27 @@ __kernel
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
@@ -691,13 +691,13 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
data[tid] = partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
@@ -758,7 +758,7 @@ __kernel
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
angle = atan2(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI_F;
angle *= 180.0f / CV_PI_F;
@@ -769,7 +769,7 @@ __kernel
s_Y[tid] = Y;
s_angle[tid] = angle;
barrier(CLK_LOCAL_MEM_FENCE);
float bestx = 0, besty = 0, best_mod = 0;
#pragma unroll
@@ -881,8 +881,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
)
{
@@ -892,8 +892,8 @@ inline uchar readerGet(
}
inline float linearFilter(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
)
{
@@ -927,9 +927,9 @@ void calc_dx_dy(
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir
)
{
@@ -976,26 +976,26 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
{
@@ -1033,10 +1033,10 @@ void reduce_sum25(
}
}
__kernel
__kernel
void compute_descriptors64(
image2d_t imgTex,
volatile __global float * descriptors,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step
@@ -1083,10 +1083,10 @@ __kernel
}
}
}
__kernel
__kernel
void compute_descriptors128(
image2d_t imgTex,
__global volatile float * descriptors,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step
@@ -1178,7 +1178,7 @@ __kernel
}
}
__kernel
__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
@@ -1219,7 +1219,7 @@ __kernel
// normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len;
}
__kernel
__kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);

View File

@@ -54,10 +54,10 @@
//----------------------------------------------------------------------------
// Histogram computation
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
const float scale, __global float* block_hists, __local float* smem)
{
const int lidX = get_local_id(0);
@@ -213,10 +213,10 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
@@ -240,12 +240,12 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
@@ -261,7 +261,7 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
}
}
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
{
@@ -291,8 +291,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
//----------------------------------------------------------------------------
// Gradients computation
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
@@ -391,7 +391,7 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global uchar * qangle,
__global const uchar * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
@@ -453,37 +453,37 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
// Resize
__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = dst_offset + dy * dst_step + dx;
int spos = src_offset + sy * src_step + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}

View File

@@ -37,348 +37,348 @@
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
}
__kernel void convert_to_S4_C4_D0(
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D0(
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
}
__kernel void convert_to_S5_C4_D0(
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D4(
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D4(
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C4_D4(
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C4_D4(
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D5(
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C1_D5(
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S0_C4_D5(
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C4_D5(
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}

View File

@@ -35,28 +35,28 @@
//
__kernel void copy_to_with_mask(
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}

View File

@@ -38,53 +38,53 @@
__kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else
{
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else
{
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
}
__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
}

View File

@@ -34,27 +34,27 @@
//
//
__kernel void set_to_with_mask(
GENTYPE scalar,
__global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
GENTYPE scalar,
__global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = scalar;
}
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if (mask)
{
dstMat[dstidx] = scalar;
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -75,7 +75,7 @@ __kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcSt
const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
@@ -91,7 +91,7 @@ __kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcSt
const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
@@ -209,20 +209,20 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
@@ -231,28 +231,28 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
volatile __local float* vmem2 = smem2;
volatile __local float* vmem3 = smem3;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem3[tid] = val3 += vmem3[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem3[tid] = val3 += vmem3[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem3[tid] = val3 += vmem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem3[tid] = val3 += vmem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem3[tid] = val3 += vmem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem3[tid] = val3 += vmem3[tid + 1];
}
}
@@ -263,18 +263,18 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
@@ -282,23 +282,23 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
}
}
@@ -307,28 +307,28 @@ void reduce1(float val1, __local float* smem1, int tid)
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
}
}
@@ -344,8 +344,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
__local float smem2[256];
__local float smem3[256];
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -359,18 +359,18 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
{
status[get_group_id(0)] = 0;
//if (calcErr)
//if (calcErr)
// err[get_group_id(0)] = 0;
}
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
@@ -380,14 +380,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float dIdy_patch[21][21];
for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
{
{
for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
{
float x = (prevPt.x + xBase + 0.5f);
float y = (prevPt.y + yBase + 0.5f);
I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
@@ -396,7 +396,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
A11 += dIdx * dIdx;
A12 += dIdx * dIdy;
A22 += dIdy * dIdy;
@@ -409,10 +409,10 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
// err[get_group_id(0)] = minEig;
if (D < 1.192092896e-07f)
@@ -431,8 +431,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float2 nextPt = nextPts[get_group_id(0)];
nextPt.x *= 2.0f;
nextPt.y *= 2.0f;
nextPt.y *= 2.0f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
@@ -447,14 +447,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float b1 = 0;
float b2 = 0;
for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float I_val = I_patch[i][j];
float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -464,7 +464,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
b2 += diff * dIdy_patch[i][j];
}
}
reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -474,7 +474,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
@@ -489,9 +489,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float I_val = I_patch[i][j];
float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -522,8 +522,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
__local float smem2[256];
__local float smem3[256];
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -537,18 +537,18 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
{
status[get_group_id(0)] = 0;
//if (calcErr)
//if (calcErr)
// err[get_group_id(0)] = 0;
}
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
@@ -558,14 +558,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float4 dIdy_patch[21][21];
for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
{
{
for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
{
float x = (prevPt.x + xBase + 0.5f);
float y = (prevPt.y + yBase + 0.5f);
I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
(3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
@@ -574,7 +574,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z;
A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z;
A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z;
@@ -587,10 +587,10 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
//if (calcErr && GET_MIN_EIGENVALS && tid == 0)
// err[get_group_id(0)] = minEig;
if (D < 1.192092896e-07f)
@@ -609,8 +609,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float2 nextPt = nextPts[get_group_id(0)];
nextPt.x *= 2.0f;
nextPt.y *= 2.0f;
nextPt.y *= 2.0f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
@@ -625,14 +625,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float b1 = 0;
float b2 = 0;
for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float4 I_val = I_patch[i][j];
float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -642,7 +642,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z;
}
}
reduce2(b1, b2, smem1, smem2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -652,7 +652,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
@@ -667,9 +667,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
{
for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
{
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float a = (nextPt.x + x + 0.5f);
float b = (nextPt.y + y + 0.5f);
float4 I_val = I_patch[i][j];
float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
@@ -694,11 +694,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
}
}
__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
{
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
int c_halfWin_x = (c_winSize_x - 1) / 2;
int c_halfWin_y = (c_winSize_y - 1) / 2;
const int patchWidth = get_local_size(0) + 2 * c_halfWin_x;
const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
@@ -712,8 +712,8 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
const int xBase = get_group_id(0) * get_local_size(0);
const int yBase = get_group_id(1) * get_local_size(1);
sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
{
for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
@@ -735,7 +735,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
barrier(CLK_LOCAL_MEM_FENCE);
// extract the patch from the first image, compute covariation matrix of derivatives
const int x = get_global_id(0);
const int y = get_global_id(1);
@@ -747,24 +747,24 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
//if (calcErr && GET_MIN_EIGENVALS)
// (err + y * errStep)[x] = minEig;
@@ -819,7 +819,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;

View File

@@ -51,9 +51,9 @@
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2;
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -263,31 +263,31 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
@@ -312,9 +312,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s
}
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -322,35 +322,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@@ -423,33 +423,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
@@ -486,10 +486,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2;
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -522,31 +522,31 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
@@ -571,9 +571,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr
}
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -581,29 +581,29 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
@@ -639,33 +639,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
}
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -702,31 +702,31 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int
}
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1));
@@ -746,9 +746,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int
}
}
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -756,29 +756,29 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
@@ -813,33 +813,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
}
}
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -877,31 +877,31 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
@@ -921,9 +921,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s
}
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -931,14 +931,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -948,18 +948,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
}
}
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -975,20 +975,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src
}
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -997,9 +997,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src
}
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -1007,14 +1007,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1025,18 +1025,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
}
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1052,20 +1052,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s
}
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1075,9 +1075,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
@@ -1085,14 +1085,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1103,18 +1103,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
}
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1130,20 +1130,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int
}
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if((x < cols) && (y < rows))
if((x < cols) && (y < rows))
{
int src_idx = mad24(y, src_step, src_offset);
int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;