Normalize line endings and whitespace

committed by Andrey Kamaev
parent 69020da607
commit 04384a71e4
@@ -40,123 +40,123 @@
__kernel
void LUT_C1_D0( __global uchar *dst,
        __global const uchar *src,
        __constant uchar *table,
        int rows,
        int cols,
        int channels,
        int whole_rows,
        int whole_cols,
        int src_offset,
        int dst_offset,
        int lut_offset,
        int src_step,
        int dst_step)
{
    int gidx = get_global_id(0)<<2;
    int gidy = get_global_id(1);
    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

    __local uchar l[256];
    l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
    //mem_fence(CLK_LOCAL_MEM_FENCE);

    //clamp(gidx,mask,cols-1);
    gidx = gidx >= cols-4?cols-4:gidx;
    gidy = gidy >= rows?rows-1:gidy;

    int src_index = src_offset + mad24(gidy,src_step,gidx);
    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
    uchar4 p,q;
    barrier(CLK_LOCAL_MEM_FENCE);
    p.x = src[src_index];
    p.y = src[src_index+1];
    p.z = src[src_index+2];
    p.w = src[src_index+3];

    q.x = l[p.x];
    q.y = l[p.y];
    q.z = l[p.z];
    q.w = l[p.w];
    *(__global uchar4*)(dst + dst_index) = q;
}

__kernel
void LUT2_C1_D0( __global uchar *dst,
        __global const uchar *src,
        __constant uchar *table,
        int rows,
        int precols,
        int channels,
        int whole_rows,
        int cols,
        int src_offset,
        int dst_offset,
        int lut_offset,
        int src_step,
        int dst_step)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    //int lidx = get_local_id(0);
    int lidy = get_local_id(1);

    __local uchar l[256];
    l[lidy] = table[lidy+lut_offset];
    //mem_fence(CLK_LOCAL_MEM_FENCE);

    //clamp(gidx,mask,cols-1);
    gidx = gidx >= precols ? cols+gidx : gidx;
    gidy = gidy >= rows?rows-1:gidy;

    int src_index = src_offset + mad24(gidy,src_step,gidx);
    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
    //uchar4 p,q;
    barrier(CLK_LOCAL_MEM_FENCE);
    uchar p = src[src_index];
    uchar q = l[p];
    dst[dst_index] = q;
}

__kernel
void LUT_C4_D0( __global uchar4 *dst,
        __global uchar4 *src,
        __constant uchar *table,
        int rows,
        int cols,
        int channels,
        int whole_rows,
        int whole_cols,
        int src_offset,
        int dst_offset,
        int lut_offset,
        int src_step,
        int dst_step)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    int lidx = get_local_id(0);
    int lidy = get_local_id(1);

    int src_index = mad24(gidy,src_step,gidx+src_offset);
    int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
    __local uchar l[256];
    l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
    //mem_fence(CLK_LOCAL_MEM_FENCE);
    barrier(CLK_LOCAL_MEM_FENCE);

    if(gidx<cols && gidy<rows)
    {
        uchar4 p = src[src_index];
        uchar4 q;
        q.x = l[p.x];
        q.y = l[p.y];
        q.z = l[p.z];
        q.w = l[p.w];
        dst[dst_index] = q;
    }
}
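
Note (reviewer sketch, not part of this commit): all three LUT kernels rely on the same cooperative-load pattern, and the commented-out mem_fence next to the barrier is worth spelling out. Assuming a 16x16 work-group, the 256 table entries map one-to-one onto work-items:

    // Each work-item copies exactly one LUT entry into local memory.
    __local uchar l[256];
    int tid = (get_local_id(1) << 4) + get_local_id(0);   // 0..255
    l[tid] = table[tid + lut_offset];
    // mem_fence only orders memory operations within a single work-item;
    // barrier(CLK_LOCAL_MEM_FENCE) is what actually guarantees every entry
    // is written before any work-item reads a neighbour's slot.
    barrier(CLK_LOCAL_MEM_FENCE);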

@@ -64,28 +64,28 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = abs_diff(src1_data, src2_data);
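
Note (reviewer sketch, not part of this commit): the dst_align bookkeeping above repeats throughout these kernels. dst_index is rounded down to a 4-byte boundary ((dst_offset + x) & ~3; + binds tighter than &, so the expression is intentional), which can push the matching source index up to 3 bytes before the row start. The index is clamped to 0 for the vload4 and the loaded vector is then rotated so the in-range lanes land where an unclamped load would have put them; the out-of-range lanes are discarded later by the dst_start/dst_end checks. The swizzle fix-up is equivalent to a rotate:

    // Equivalent of the .wxyz/.zwxy/.yzwx selection, for src_index in {-1,-2,-3}.
    void rotate_fixup(uchar v[4], int src_index)
    {
        if (src_index >= 0) return;
        uchar t[4];
        for (int i = 0; i < 4; ++i)
            t[i] = v[(i + 4 + src_index) & 3];   // rotate right by -src_index
        for (int i = 0; i < 4; ++i)
            v[i] = t[i];
    }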
@@ -112,8 +112,8 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -146,8 +146,8 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -250,20 +250,20 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));

@@ -289,7 +289,7 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -320,7 +320,7 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -423,7 +423,7 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -565,7 +565,7 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -575,9 +575,9 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));

@@ -588,17 +588,17 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
        uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));

        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;

@@ -619,7 +619,7 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -643,12 +643,12 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -669,7 +669,7 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -693,12 +693,12 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -716,7 +716,7 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));

@@ -750,13 +750,13 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));

        float src2_data_0 = src2.x;
        float src2_data_1 = src2.y;
        float src2_data_2 = src2.z;

@@ -786,13 +786,13 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));

        double src2_data_0 = src2.x;
        double src2_data_1 = src2.y;
        double src2_data_2 = src2.z;
@@ -65,28 +65,28 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);
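
Note (reviewer sketch, not part of this commit): the widen-add-narrow sequence above is what gives cv::add its saturating semantics. A plain uchar addition would wrap modulo 256; going through short4 and converting back with _sat clamps instead:

    uchar4 a = (uchar4)(200, 10, 0, 255);
    uchar4 b = (uchar4)(100, 10, 0, 1);
    short4 wide = convert_short4_sat(a) + convert_short4_sat(b); // (300, 20, 0, 256)
    uchar4 sum = convert_uchar4_sat(wide);                       // (255, 20, 0, 255)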
@@ -113,8 +113,8 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -148,8 +148,8 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -253,38 +253,38 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        int mask_index_fix = mask_index < 0 ? 0 : mask_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
        uchar4 mask_data = vload4(0, mask + mask_index_fix);
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        if(mask_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);
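
Note (reviewer sketch, not part of this commit): the masked kernels that follow all finish with data = mask_data ? data : dst_data;. With a scalar mask this is the usual nonzero test, but when both sides are vectors OpenCL's ?: selects per component on the most significant bit of the condition, so the 0/255 masks OpenCV produces pick the freshly computed value exactly where the mask is set:

    uchar2 mask = (uchar2)(255, 0);
    uchar2 sum = (uchar2)(7, 7);
    uchar2 old = (uchar2)(1, 1);
    uchar2 out = mask ? sum : old;   // (7, 1): only lane 0 has its MSB set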
@@ -312,8 +312,8 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -349,8 +349,8 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -395,7 +395,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat((long)src_data1 + (long)src_data2);
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }

@@ -425,7 +425,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
        float dst_data = *((__global float *)((__global char *)dst + dst_index));

        float data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }

@@ -456,7 +456,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
        double dst_data = *((__global double *)((__global char *)dst + dst_index));

        double data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }

@@ -478,8 +478,8 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -525,7 +525,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,

        int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -555,7 +555,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i

        int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -584,7 +584,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -613,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
        float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -644,7 +644,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
        double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -665,8 +665,8 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -692,17 +692,17 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
        uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;

@@ -725,8 +725,8 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -753,12 +753,12 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -781,8 +781,8 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -809,12 +809,12 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -834,8 +834,8 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -878,15 +878,15 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));

        float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
        float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
        float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));

@@ -924,15 +924,15 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));

        double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
        double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
        double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));

@@ -981,7 +981,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

        uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }

@@ -1010,7 +1010,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

        ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1039,7 +1039,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1068,7 +1068,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1097,7 +1097,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));

        float4 data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1128,7 +1128,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));

        double4 data = src_data1 + src_data2;
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -61,30 +61,30 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data, src2_data;

        src1_data.x = src1_index+0 >= 0 ? src1[src1_index+0] : 0;
        src1_data.y = src1_index+1 >= 0 ? src1[src1_index+1] : 0;
        src1_data.z = src1_index+2 >= 0 ? src1[src1_index+2] : 0;
        src1_data.w = src1_index+3 >= 0 ? src1[src1_index+3] : 0;

        src2_data.x = src2_index+0 >= 0 ? src2[src2_index+0] : 0;
        src2_data.y = src2_index+1 >= 0 ? src2[src2_index+1] : 0;
        src2_data.z = src2_index+2 >= 0 ? src2[src2_index+2] : 0;
        src2_data.w = src2_index+3 >= 0 ? src2[src2_index+3] : 0;

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
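
Note (reviewer sketch, not part of this commit): per pixel these addWeighted kernels compute dst = saturate(src1*alpha + src2*beta + gama), with the weights coming from kernel arguments that the hunk headers truncate. A plausible completion of the commented-out line, assuming float weights, would be:

    // Assumption: alpha, beta, gama are float kernel arguments.
    float4 tmp = convert_float4(src1_data) * alpha
               + convert_float4(src2_data) * beta + gama;
    uchar4 tmp_data = convert_uchar4_sat(tmp);

The zero fill of lanes whose source index is still negative is harmless, because those lanes are discarded by the dst_start/dst_end range checks on write-back.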
@@ -118,14 +118,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -164,14 +164,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -209,18 +209,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define bitOfInt (sizeof(int) == 4 ? 2 : 3)

#define dst_align ((dst_offset >> bitOfInt) & 3)

        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) - (dst_align << bitOfInt));
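
Note (reviewer sketch, not part of this commit): bitOfInt is just log2(sizeof(int)), so x << bitOfInt turns an element index into a byte offset and the source stays correct even on an ABI where int is 8 bytes wide:

    // sizeof(int) == 4  ->  x << 2  ==  x * 4
    // sizeof(int) == 8  ->  x << 3  ==  x * 8

The float and double variants below hard-code the equivalent shifts (<< 2 and << 3).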
@@ -257,16 +257,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 2) & 3)

        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2) - (dst_align << 2));

@@ -305,16 +305,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 3) & 3)

        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 3) - (dst_align << 3));
@@ -60,21 +60,21 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) + src2_data;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

@@ -100,7 +100,7 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -132,7 +132,7 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -234,7 +234,7 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -379,7 +379,7 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -389,9 +389,9 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
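
Note (reviewer sketch, not part of this commit): the three rotated broadcasts above come from the 3-channel packing. Four packed pixels occupy 12 bytes, i.e. three uchar4 loads whose channel order rotates, and the scalar src2 must rotate with them:

    // bytes 0..3  : B0 G0 R0 B1  <->  (src2.x, src2.y, src2.z, src2.x)
    // bytes 4..7  : G1 R1 B2 G2  <->  (src2.y, src2.z, src2.x, src2.y)
    // bytes 8..11 : R2 B3 G3 R3  <->  (src2.z, src2.x, src2.y, src2.z)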
@@ -402,17 +402,17 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
|
||||
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
@@ -433,7 +433,7 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@@ -457,12 +457,12 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
@@ -483,7 +483,7 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@@ -507,12 +507,12 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src

data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -530,7 +530,7 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -564,13 +564,13 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));


float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
@@ -600,13 +600,13 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));


double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;

@@ -62,29 +62,29 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}

uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
@@ -112,7 +112,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,7 +147,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -190,7 +190,7 @@ __kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_ste
int dst_data = *((__global int *)((__global char *)dst + dst_index));

int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -218,7 +218,7 @@ __kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_s
float dst_data = *((__global float *)((__global char *)dst + dst_index));

float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global float *)((__global char *)dst + dst_index)) = data;
}
@@ -248,7 +248,7 @@ __kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_
double dst_data = *((__global double *)((__global char *)dst + dst_index));

double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global double *)((__global char *)dst + dst_index)) = data;
}
@@ -268,7 +268,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -307,12 +307,12 @@ __kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_st
uchar mask_data = *(mask + mask_index);

ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

int2 tmp = convert_int2_sat(src_data1) + src_data2;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -335,12 +335,12 @@ __kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_ste
uchar mask_data = *(mask + mask_index);

short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

int2 tmp = convert_int2_sat(src_data1) + src_data2;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -363,11 +363,11 @@ __kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step,
uchar mask_data = *(mask + mask_index);

int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -390,11 +390,11 @@ __kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_ste
uchar mask_data = *(mask + mask_index);

float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));

float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
@@ -419,11 +419,11 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st
uchar mask_data = *(mask + mask_index);

double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));

double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
@@ -444,7 +444,7 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -470,17 +470,17 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -502,7 +502,7 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -513,9 +513,9 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));

int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);

uchar2 mask_data = vload2(0, mask + mask_index);

@@ -529,12 +529,12 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -556,7 +556,7 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -567,9 +567,9 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));

int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);

uchar2 mask_data = vload2(0, mask + mask_index);

@@ -583,12 +583,12 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -607,7 +607,7 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -615,9 +615,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));

int src2_data_0 = src2.x;
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int src2_data_2 = src2.z;

uchar mask_data = * (mask + mask_index);

@@ -649,17 +649,17 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));

float src2_data_0 = src2.x;

float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float src2_data_2 = src2.z;

uchar mask_data = * (mask + mask_index);

@@ -693,17 +693,17 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));

double src2_data_0 = src2.x;

double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double src2_data_2 = src2.z;

uchar mask_data = * (mask + mask_index);

@@ -747,7 +747,7 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -773,7 +773,7 @@ __kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_st
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -799,7 +799,7 @@ __kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_ste
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -825,7 +825,7 @@ __kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step,
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -851,7 +851,7 @@ __kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_ste
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));

float4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
@@ -879,7 +879,7 @@ __kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_st
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));

double4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global double4 *)((__global char *)dst + dst_index)) = data;
}

@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);

short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;

data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));

int data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

ushort2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

short2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

int2 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

char8 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 & src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));


char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));


char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

uchar4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));

char4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

ushort4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

short4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

int4 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

char16 data = src_data1 & src_data2;
data = mask_data ? data : dst_data;
data = mask_data ? data : dst_data;

*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 & src_data2_2;
char8 data_3 = src_data1_3 & src_data2_3;

data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;

*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i

short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);


short4 tmp_data = src1_data & src2_data;

*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,

uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;



data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,

char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);


char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;


data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;

@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);

ushort2 data = src_data1 & src_data2;


*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);

short2 data = src_data1 & src_data2;


*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

char8 tmp_data = src1_data & src2_data;


*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

short8 tmp_data = src1_data & src2_data;


*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;


data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2));

data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step

data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,

data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step,

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
        char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
        short4 tmp_data_0 = src1_data_0 & src2_data_0;
        short4 tmp_data_1 = src1_data_1 & src2_data_1;
        short4 tmp_data_2 = src1_data_2 & src2_data_2;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step,
                                      src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);

        char16 tmp_data = src1_data & src2_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
    }
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

        short4 tmp_data_0 = src1_data_0 & src2_data_0;
        short4 tmp_data_1 = src1_data_1 & src2_data_1;
        short4 tmp_data_2 = src1_data_2 & src2_data_2;
        short4 tmp_data_3 = src1_data_3 & src2_data_3;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
    }
}
#endif
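All of the C3 (three-channel) kernels above share one addressing idiom: each work-item covers four 3-element pixels, dst_align backs the window up to the previous 4-element boundary so the vector loads and stores stay aligned, and the (dst_index + k >= dst_start) / (dst_index + k < dst_end) guards leave the out-of-window lanes untouched. The rotating scalar broadcasts -- (char4)(src2.x, src2.y, src2.z, src2.x), (char4)(src2.y, src2.z, src2.x, src2.y), (char4)(src2.z, src2.x, src2.y, src2.z) -- lay the three-channel scalar across the twelve interleaved bytes the three stores cover. A minimal plain-C sketch of the index arithmetic, with hypothetical offsets (the formulas are copied from the kernels; the values are made up):

    /* Sketch of the C3_D1 addressing, assuming dst_offset = 5 and step = 640. */
    #include <stdio.h>

    int main(void)
    {
        int dst_offset = 5, dst_step = 640;
        int src1_offset = 5, src1_step = 640;
        int x = 0, y = 0;                /* work-item index, already shifted: x <<= 2 */

        /* same formula as: #define dst_align (((dst_offset % dst_step) / 3 ) & 3) */
        int dst_align = ((dst_offset % dst_step) / 3) & 3;                  /* = 1 */
        /* mad24(a, b, c) is a * b + c */
        int src1_index = y * src1_step + (x * 3) + src1_offset - (dst_align * 3);
        int dst_start = y * dst_step + dst_offset;

        /* vload4 at src1_index starts 3 bytes before the row's first pixel;
           lanes with dst_index + k < dst_start keep the old dst contents */
        printf("dst_align=%d src1_index=%d dst_start=%d\n",
               dst_align, src1_index, dst_start);      /* prints 1, 2 and 5 */
        return 0;
    }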
@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        ushort2 tmp_data = src1_data & src2_data;

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;

@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int
        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = src_data1 & src_data2;
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
        char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

        char4 data = src1_data & src2_data;
        data = mask_data ? data : dst_data;

        *((__global char4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = src1_data & src2_data;
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int
        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

        ushort2 data = src_data1 & src_data2;
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int
        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        short2 data = src_data1 & src_data2;
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr
        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = src_data1 & src_data2;
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s
        char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

        char8 data = src1_data & src2_data;

        data = mask_data ? data : dst_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
        short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));

        short8 data = src1_data & src2_data;
        data = mask_data ? data : dst_data;

        *((__global short8 *)((__global char *)dst + dst_index)) = data;
    }
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
        uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
        uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 mask_data = vload4(0, mask + mask_index);

@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
        uchar4 tmp_data_2 = src1_data_2 & src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
        char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
        char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
        char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 mask_data = vload4(0, mask + mask_index);

@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
        char4 tmp_data_0 = src1_data_0 & src2_data_0;
        char4 tmp_data_1 = src1_data_1 & src2_data_1;
        char4 tmp_data_2 = src1_data_2 & src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
        ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
        ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
        ushort2 tmp_data_2 = src1_data_2 & src2_data_2;

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        uchar mask_data = * (mask + mask_index);

        char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        uchar mask_data = * (mask + mask_index);

        short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
        short4 tmp_data_0 = src1_data_0 & src2_data_0;
        short4 tmp_data_1 = src1_data_1 & src2_data_1;
        short4 tmp_data_2 = src1_data_2 & src2_data_2;

        data_0 = mask_data ? tmp_data_0 : data_0;
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

        uchar4 data = src_data1 & src2;
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s
        char4 dst_data = *((__global char4 *)(dst + dst_index));

        char4 data = src_data1 & src2;
        data = mask_data ? data : dst_data;

        *((__global char4 *)(dst + dst_index)) = data;
    }
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

        ushort4 data = src_data1 & src2;
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = src_data1 & src2;
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = src_data1 & src2;
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s
        char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

        char16 data = src1_data & src2_data;
        data = mask_data ? data : dst_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

        short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
        short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
        short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
        short4 data_1 = src1_data_1 & src2_data_1;
        short4 data_2 = src1_data_2 & src2_data_2;
        short4 data_3 = src1_data_3 & src2_data_3;

        data_0 = mask_data ? data_0 : dst_data_0;
        data_1 = mask_data ? data_1 : dst_data_1;
        data_2 = mask_data ? data_2 : dst_data_2;
        data_3 = mask_data ? data_3 : dst_data_3;

        *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
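The *_with_mask select above is a single ternary per vector. In OpenCL C a scalar condition with vector operands picks one operand whole, so data = mask_data ? data : dst_data; keeps or discards the entire pixel in one step. A scalar model of the same select in plain C, with made-up values:

    /* Scalar model of:  data = mask_data ? data : dst_data;  (values made up) */
    #include <stdio.h>

    int main(void)
    {
        unsigned char mask_data = 0;          /* 0 means the pixel is masked out */
        int data = 0x35 & 0x0F;               /* freshly computed bitwise result */
        int dst_data = 0x77;                  /* value already stored in dst     */

        data = mask_data ? data : dst_data;   /* masked-out pixel keeps dst_data */
        printf("0x%02x\n", data);             /* prints 0x77                     */
        return 0;
    }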
@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -95,7 +95,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -129,7 +129,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,7 +164,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -238,12 +238,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
    {
        int src_index = mad24(y, src_step, (x << 3) + src_offset);
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        char8 data;

        data = *((__global char8 *)((__global char *)src + src_index));
        data = ~ data;

        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }
}
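arithm_bitwise_not_D6 appears to be the 64-bit (double-depth) case: ~ is not defined for floating-point types, so the kernel views each 8-byte element as char8 and flips the bytes instead. The same reinterpretation in plain C (a sketch only; memcpy stands in for the kernel's char8 pointer cast):

    /* Byte-wise NOT of a 64-bit value via reinterpretation, as in not_D6. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        double v = 1.0, out;
        unsigned char bytes[sizeof v];        /* the kernel's char8 view */

        memcpy(bytes, &v, sizeof v);
        for (size_t i = 0; i < sizeof v; ++i)
            bytes[i] = (unsigned char)~bytes[i];   /* data = ~ data; */
        memcpy(&out, bytes, sizeof out);

        printf("%a -> %a\n", v, out);         /* all 64 bits inverted */
        return 0;
    }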
@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        short2 tmp_data = src1_data | src2_data;

        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_
        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
        char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

        char4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
        char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

        char8 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

        ushort2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        short2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_
        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
        char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

        char8 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
        char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

        char16 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
        uchar4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
        char4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
        char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
        char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
        char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
        char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));

        char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
        char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
        char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

        uchar4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
        char4 dst_data = *((__global char4 *)(dst + dst_index));

        char4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char4 *)(dst + dst_index)) = data;
    }
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

        ushort4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
        char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

        char16 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
        char8 data_2 = src_data1_2 | src_data2_2;
        char8 data_3 = src_data1_3 | src_data2_3;

        data_0 = mask_data ? data_0 : dst_data_0;
        data_1 = mask_data ? data_1 : dst_data_1;
        data_2 = mask_data ? data_2 : dst_data_2;
        data_3 = mask_data ? data_3 : dst_data_3;

        *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
        *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;
@@ -62,7 +62,7 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -96,7 +96,7 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -130,7 +130,7 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -161,7 +161,7 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -225,7 +225,7 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst, int dst_step, int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -283,7 +283,7 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -316,7 +316,7 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst, int dst_step, int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -338,7 +338,7 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -403,7 +403,7 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -426,7 +426,7 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst, int dst_step, int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,

        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -447,9 +447,9 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,

        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
        uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
        uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
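// The three rotated constants above vectorize a 3-channel scalar: four pixels of
// three channels are 12 bytes, i.e. three uchar4 loads, and the channel pattern
// cycles through the lanes as (x,y,z,x), (y,z,x,y), (z,x,y,z). OR-ing each load
// with its matching rotation applies the per-channel scalar without unpacking
// any pixel.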
@@ -460,17 +460,17 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,

        uchar4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
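// The (dst_index + k >= dst_start) && (dst_index + k < dst_end) guards above handle
// rows whose ROI does not start or end on a 4-pixel boundary: lanes outside the row
// keep the previously loaded dst value, so the full-width vector stores never clobber
// bytes belonging to a neighbouring row of the destination matrix.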
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -494,7 +494,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i

        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -504,9 +504,9 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i

        char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
        char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
        char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

        char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
        char4 data_1 = *((__global char4 *)(dst + dst_index + 4));

@@ -517,17 +517,17 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i

        char4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global char4 *)(dst + dst_index + 0)) = data_0;

@@ -539,7 +539,7 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst, int dst_step, int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -550,7 +550,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,

        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -574,12 +574,12 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -590,7 +590,7 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -601,7 +601,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,

        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -625,12 +625,12 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -641,7 +641,7 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));

@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -685,16 +685,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        char4 tmp_data_0 = src1_data_0 | src2_data_0;
        char4 tmp_data_1 = src1_data_1 | src2_data_1;

@@ -709,7 +709,7 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -717,13 +717,13 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

@@ -735,7 +735,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in

        short4 tmp_data_0 = src1_data_0 | src2_data_0;
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;

@@ -745,7 +745,7 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst, int dst_step, int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -768,7 +768,7 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -790,7 +790,7 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst, int dst_step, int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -811,7 +811,7 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -832,7 +832,7 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step,

__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -853,7 +853,7 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in

__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst, int dst_step, int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -877,7 +877,7 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i

__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -897,17 +897,17 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in

        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

        short4 tmp_data_0 = src1_data_0 | src2_data_0;
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;
        short4 tmp_data_3 = src1_data_3 | src2_data_3;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
    }
}
#endif
@@ -54,7 +54,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
                                                   __global uchar *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -65,7 +65,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
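// mask_index above applies the same dst_align correction as src1_index: the mask is
// one uchar per pixel, so shifting both indices by the same amount keeps the mask
// lanes in step with the pixel lanes after the alignment round-down.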
@@ -93,7 +93,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -131,7 +131,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
                                                   __global ushort *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -142,7 +142,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -166,7 +166,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
                                                   __global short *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -177,7 +177,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -201,7 +201,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
                                                   __global int *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -220,7 +220,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s

        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }

@@ -230,7 +230,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int

        char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

        char4 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -260,7 +260,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
                                                   __global short *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -279,7 +279,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = src1_data | src2_data;
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -289,7 +289,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
                                                   __global uchar *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -300,7 +300,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -326,7 +326,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -337,7 +337,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -362,7 +362,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
                                                   __global ushort *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -381,7 +381,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int

        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

        ushort2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -390,7 +390,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
                                                   __global short *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -409,7 +409,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s

        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        short2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -418,7 +418,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
                                                   __global int *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -437,7 +437,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src

        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -446,7 +446,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -463,8 +463,8 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr

        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
        char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
        char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

        char8 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }

@@ -474,7 +474,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -493,7 +493,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr

        short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));

        short8 data = src1_data | src2_data;
        data = mask_data ? data : dst_data;

        *((__global short8 *)((__global char *)dst + dst_index)) = data;
    }

@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
                                                   __global uchar *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -514,7 +514,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s

        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -525,9 +525,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s

        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
        uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
        uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 mask_data = vload4(0, mask + mask_index);

@@ -540,17 +540,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s

        uchar4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;

@@ -564,7 +564,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -575,7 +575,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr

        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -586,9 +586,9 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr

        char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
        char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
        char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 mask_data = vload4(0, mask + mask_index);

@@ -601,17 +601,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr

        char4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global char4 *)(dst + dst_index + 0)) = data_0;

@@ -624,7 +624,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
                                                   __global ushort *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -635,7 +635,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int

        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -662,12 +662,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -679,7 +679,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
                                                   __global short *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -690,7 +690,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s

        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -717,12 +717,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;

@@ -734,7 +734,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
                                                   __global int *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -742,7 +742,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -777,7 +777,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -785,18 +785,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        uchar mask_data = * (mask + mask_index);

        char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));

@@ -829,18 +829,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        uchar mask_data = * (mask + mask_index);

        short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));

@@ -850,7 +850,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src

        short4 tmp_data_0 = src1_data_0 | src2_data_0;
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;

        data_0 = mask_data ? tmp_data_0 : data_0;
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
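// In the C3_D6 variant a single mask byte gates all three short4 vectors of the
// work-item's 24-byte block at once: the whole block is either written or left
// untouched, which avoids per-lane guards entirely (contrast the per-component
// selects in the C3_D0..C3_D3 kernels above).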
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
                                                   __global uchar *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -883,7 +883,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

        uchar4 data = src_data1 | src2;
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }

@@ -894,7 +894,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -912,7 +912,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr

        char4 dst_data = *((__global char4 *)(dst + dst_index));

        char4 data = src_data1 | src2;
        data = mask_data ? data : dst_data;

        *((__global char4 *)(dst + dst_index)) = data;
    }

@@ -922,7 +922,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
                                                   __global ushort *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -940,7 +940,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

        ushort4 data = src_data1 | src2;
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -949,7 +949,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
                                                   __global short *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -967,7 +967,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = src_data1 | src2;
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -976,7 +976,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
                                                   __global int *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -994,7 +994,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src

        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = src_data1 | src2;
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1003,7 +1003,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
                                                   __global char *dst, int dst_step, int dst_offset,
                                                   __global uchar *mask, int mask_step, int mask_offset,
                                                   char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);

@@ -1023,7 +1023,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr

        char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

        char16 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }

@@ -1055,7 +1055,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src

        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

        short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
        short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
        short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));

@@ -1065,10 +1065,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src

        short4 data_1 = src1_data_1 | src2_data_1;
        short4 data_2 = src1_data_2 | src2_data_2;
        short4 data_3 = src1_data_3 | src2_data_3;

        data_0 = mask_data ? data_0 : dst_data_0;
        data_1 = mask_data ? data_1 : dst_data_1;
        data_2 = mask_data ? data_2 : dst_data_2;
        data_3 = mask_data ? data_3 : dst_data_3;

        *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -99,8 +99,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -136,8 +136,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s

        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -174,8 +174,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr

        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
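// In the 16-bit xor kernels the alignment bookkeeping is done in elements rather
// than bytes: (dst_offset >> 1) converts the byte offset to a short index before
// taking it modulo the 4-element vector width, and the correction is scaled back
// to bytes with (dst_align << 1) when it is folded into the byte-indexed mad24.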
@@ -65,8 +65,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -106,8 +106,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_

        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -147,8 +147,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1

        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -198,8 +198,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1

        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        short2 tmp_data = src1_data ^ src2_data;

        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@@ -234,7 +234,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1
int dst_data = *((__global int *)((__global char *)dst + dst_index));

int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -266,7 +266,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -299,7 +299,7 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -324,8 +324,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -362,8 +362,8 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -408,7 +408,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -437,7 +437,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -466,7 +466,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -495,7 +495,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

char8 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
@@ -525,7 +525,7 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -549,8 +549,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -576,17 +576,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -611,8 +611,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -638,17 +638,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -672,8 +672,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -700,12 +700,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -728,8 +728,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -756,12 +756,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -781,8 +781,8 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -825,15 +825,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
@@ -870,15 +870,15 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));

char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
@@ -930,7 +930,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));

uchar4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global uchar4 *)(dst + dst_index)) = data;
}
@@ -961,7 +961,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
char4 dst_data = *((__global char4 *)(dst + dst_index));

char4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char4 *)(dst + dst_index)) = data;
}
@@ -991,7 +991,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));

ushort4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1020,7 +1020,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

short4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1049,7 +1049,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

int4 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
@@ -1078,7 +1078,7 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

char16 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
@@ -1123,10 +1123,10 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
char8 data_2 = src_data1_2 ^ src_data2_2;
char8 data_3 = src_data1_3 ^ src_data2_3;

data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;

*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;

@@ -64,7 +64,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -98,7 +98,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -132,7 +132,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -163,7 +163,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -249,7 +249,7 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i

short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);

short4 tmp_data = src1_data ^ src2_data;

*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
@@ -269,7 +269,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -280,7 +280,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,

uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;

data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@@ -303,7 +303,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -311,10 +311,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,

char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);

char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;

data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;

@@ -339,7 +339,7 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step
ushort2 src_data2 = (ushort2)(src2.x, src2.y);

ushort2 data = src_data1 ^ src_data2;

*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -360,7 +360,7 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step,
short2 src_data2 = (short2)(src2.x, src2.y);

short2 data = src_data1 ^ src_data2;

*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
@@ -401,7 +401,7 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step,
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

char8 tmp_data = src1_data ^ src2_data;

*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -423,7 +423,7 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

short8 tmp_data = src1_data ^ src2_data;

*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -441,7 +441,7 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -451,9 +451,9 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -462,19 +462,19 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -497,7 +497,7 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -507,9 +507,9 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
@@ -520,17 +520,17 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -552,7 +552,7 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -576,12 +576,12 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step

data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -602,7 +602,7 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@@ -626,12 +626,12 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,

data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -649,7 +649,7 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -683,16 +683,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step,

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
@@ -718,13 +718,13 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
@@ -736,7 +736,7 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;

*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
@@ -864,7 +864,7 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);

char16 tmp_data = src1_data ^ src2_data;

*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
@@ -891,17 +891,17 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;

*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;

}
}
#endif

@@ -66,7 +66,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -104,7 +104,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
x = x << 2;

#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -141,7 +141,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -154,7 +154,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int

ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data ^ src2_data;

data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;

@@ -175,7 +175,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -217,7 +217,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int
int dst_data = *((__global int *)((__global char *)dst + dst_index));

int data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global int *)((__global char *)dst + dst_index)) = data;
}
@@ -245,7 +245,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));

char4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;

*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
@@ -274,7 +274,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

short4 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;

*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
@@ -294,7 +294,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -330,7 +330,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -373,7 +373,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

ushort2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
@@ -400,7 +400,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

short2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
@@ -427,7 +427,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

int2 data = src_data1 ^ src_data2;
data = mask_data ? data : dst_data;

*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
@@ -454,7 +454,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));

char8 data = src1_data ^ src2_data;

data = mask_data ? data : dst_data;

*((__global char8 *)((__global char *)dst + dst_index)) = data;
@@ -483,7 +483,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));

short8 data = src1_data ^ src2_data;
data = mask_data ? data : dst_data;

*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
@@ -503,7 +503,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -514,9 +514,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

uchar4 mask_data = vload4(0, mask + mask_index);

@@ -529,17 +529,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -563,7 +563,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -574,9 +574,9 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);

char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);

uchar4 mask_data = vload4(0, mask + mask_index);

@@ -587,19 +587,19 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;

data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;

data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;

*((__global char4 *)(dst + dst_index + 0)) = data_0;
@@ -622,7 +622,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -646,15 +646,15 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -676,7 +676,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

int dst_start = mad24(y, dst_step, dst_offset);
@@ -703,12 +703,12 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int

data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;

data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;

*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -727,7 +727,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

@@ -769,18 +769,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

uchar mask_data = * (mask + mask_index);

char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
@@ -812,18 +812,18 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr

if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

uchar mask_data = * (mask + mask_index);

short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
@@ -833,7 +833,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;

data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
@@ -865,7 +865,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
|
||||
uchar4 data = src_data1 ^ src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
@@ -893,7 +893,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
|
||||
char4 data = src_data1 ^ src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
@@ -920,7 +920,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort4 data = src_data1 ^ src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -946,7 +946,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
short4 data = src_data1 ^ src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -972,7 +972,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr

        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = src_data1 ^ src2;
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1000,7 +1000,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s

        char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));

        char16 data = src1_data ^ src2_data;
        data = mask_data ? data : dst_data;

        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }
@@ -1032,7 +1032,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr

        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

        short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
        short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
        short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));

@@ -1042,10 +1042,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr

        short4 data_1 = src1_data_1 ^ src2_data_1;
        short4 data_2 = src1_data_2 ^ src2_data_2;
        short4 data_3 = src1_data_3 ^ src2_data_3;

        data_0 = mask_data ? data_0 : dst_data_0;
        data_1 = mask_data ? data_1 : dst_data_1;
        data_2 = mask_data ? data_2 : dst_data_2;
        data_3 = mask_data ? data_3 : dst_data_3;

        *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
@@ -63,8 +63,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
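// dst_align measures how far dst_offset sits past the previous 4-element
// boundary; subtracting it from the source indices shifts the reads left by
// the same amount so the 4-wide destination stores stay aligned, while
// dst_start/dst_end bound the row so the shifted head and tail pixels are not
// written outside the valid span. The same pattern repeats below, scaled by
// the element size (<< 1 for 16-bit, << 2 for 32-bit, << 3 for 64-bit types).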
@@ -99,8 +99,8 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -137,8 +137,8 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -170,11 +170,11 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_

    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -206,8 +206,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -240,8 +240,8 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -276,8 +276,8 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -312,8 +312,8 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -350,8 +350,8 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -384,8 +384,8 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -417,8 +417,8 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -451,8 +451,8 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -487,8 +487,8 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -525,8 +525,8 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -563,8 +563,8 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -598,8 +598,8 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_

        x = x << 2;

        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -632,8 +632,8 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -667,8 +667,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -59,8 +59,8 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -97,8 +97,8 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -135,8 +135,8 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -169,8 +169,8 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -202,8 +202,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -236,8 +236,8 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -258,7 +258,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr

    }
#endif

/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                    __global uchar *src2, int src2_step, int src2_offset,
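// Like cv::compare on the host side, these kernels are expected to produce
// 255 where the predicate holds and 0 elsewhere, so the output can be used
// directly as a mask.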
@@ -273,8 +273,8 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -311,8 +311,8 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -349,8 +349,8 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -383,8 +383,8 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -416,8 +416,8 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -450,8 +450,8 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -486,8 +486,8 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -524,8 +524,8 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -562,8 +562,8 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -596,8 +596,8 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -629,8 +629,8 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -663,8 +663,8 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr

    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -49,7 +49,7 @@ typedef double F ;

typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F double
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
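// With this #if/#else, kernel bodies written against F/F4/convert_F4 compile
// to double arithmetic on devices that report fp64 support and fall back to
// float otherwise, without duplicating the kernel source.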
@@ -102,8 +102,8 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -143,8 +143,8 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -183,8 +183,8 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -298,7 +298,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src_index = mad24(y, src_step, x + src_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -334,7 +334,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);

@@ -369,7 +369,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -84,7 +84,7 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr

        double src_data = *((__global double *)((__global char *)src + srcIdx));
        double dst_data = exp(src_data);

        *((__global double *)((__global char *)dst + dstIdx)) = dst_data;
        // dst[dstIdx] = exp(src[srcIdx]);
    }
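// srcIdx/dstIdx are byte offsets, so the pointers are routed through
// (__global char *) before the cast to double; the commented-out line shows
// the plain element indexing that would apply if the steps were element
// counts instead.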
@@ -48,7 +48,7 @@

#endif

//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_offset,
                                   __global uchar *dst, int dst_step, int dst_offset,
@@ -62,8 +62,8 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);

        int dst_start_0 = mad24(y, dst_step, dst_offset);
        int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
@@ -71,22 +71,22 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of

        int dst_end_1   = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
        int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);

        int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
        int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
        uchar4 src_data_0 = vload4(0, src + src1_index_fix);
        uchar4 src_data_1 = vload4(0, src + src2_index_fix);
        if(src_index_0 < 0)
        {
            uchar4 tmp;
            tmp.xyzw        = (src_index_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
            src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz : tmp.xyzw;
        }
        if(src_index_1 < 0)
        {
            uchar4 tmp;
            tmp.xyzw        = (src_index_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
            src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
        }

        uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
        uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));
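// When dst_align pushes src_index_0/src_index_1 negative (rows whose shifted
// start lies before the buffer), the index is clamped to 0 for the vload4 and
// the loaded lanes are rotated (.wxyz/.zwxy/.yzwx) so each byte still lands in
// its proper slot. Note also that in dst_offset + x & (int)0xfffffffc the +
// binds tighter than &, so the whole sum is rounded down to a 4-byte boundary.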
@@ -117,8 +117,8 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off

        x = x << 2;

        #define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);

        int dst_start_0 = mad24(y, dst_step, dst_offset);
        int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);

@@ -159,8 +159,8 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o

        x = x << 2;

        #define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);

        int dst_start_0 = mad24(y, dst_step, dst_offset);
        int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);

@@ -201,8 +201,8 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of

        x = x << 2;

        #define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);

        int dst_start_0 = mad24(y, dst_step, dst_offset);
        int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);

@@ -243,7 +243,7 @@ __kernel void arithm_flip_rows_D4 (__global int *src, int src_step, int src_offs

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);

@@ -265,7 +265,7 @@ __kernel void arithm_flip_rows_D5 (__global float *src, int src_step, int src_of

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);

@@ -289,7 +289,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 3) + dst_offset);
@@ -302,7 +302,7 @@ __kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_o

    }
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src_offset,
                                      __global uchar *dst, int dst_step, int dst_offset,
@@ -315,7 +315,7 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x) + src_offset);
        int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);

@@ -337,7 +337,7 @@ __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x) + src_offset);
        int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);

@@ -359,7 +359,7 @@ __kernel void arithm_flip_cols_C1_D2 (__global ushort *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -381,7 +381,7 @@ __kernel void arithm_flip_cols_C1_D3 (__global short *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -403,7 +403,7 @@ __kernel void arithm_flip_cols_C1_D4 (__global int *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -425,7 +425,7 @@ __kernel void arithm_flip_cols_C1_D5 (__global float *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -449,7 +449,7 @@ __kernel void arithm_flip_cols_C1_D6 (__global double *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
@@ -472,7 +472,7 @@ __kernel void arithm_flip_cols_C2_D0 (__global uchar *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -494,7 +494,7 @@ __kernel void arithm_flip_cols_C2_D1 (__global char *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -516,7 +516,7 @@ __kernel void arithm_flip_cols_C2_D2 (__global ushort *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -538,7 +538,7 @@ __kernel void arithm_flip_cols_C2_D3 (__global short *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -560,7 +560,7 @@ __kernel void arithm_flip_cols_C2_D4 (__global int *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -582,7 +582,7 @@ __kernel void arithm_flip_cols_C2_D5 (__global float *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -606,7 +606,7 @@ __kernel void arithm_flip_cols_C2_D6 (__global double *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
@@ -630,7 +630,7 @@ __kernel void arithm_flip_cols_C3_D0 (__global uchar *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
        int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
        int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);

@@ -662,7 +662,7 @@ __kernel void arithm_flip_cols_C3_D1 (__global char *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
        int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
        int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);

@@ -694,7 +694,7 @@ __kernel void arithm_flip_cols_C3_D2 (__global ushort *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);

@@ -726,7 +726,7 @@ __kernel void arithm_flip_cols_C3_D3 (__global short *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);

@@ -758,14 +758,14 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);

        int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
        int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
        int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));

        int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
        int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
        int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));

@@ -773,7 +773,7 @@ __kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_o

        *((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
        *((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
        *((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;

        *((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
        *((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
        *((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;

@@ -790,14 +790,14 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);

        float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
        float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
        float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));

        float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
        float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
        float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));

@@ -805,7 +805,7 @@ __kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src

        *((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
        *((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
        *((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;

        *((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
        *((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
        *((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;

@@ -824,14 +824,14 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);

        double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0));
        double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8));
        double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));

        double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0));
        double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8));
        double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));

@@ -839,7 +839,7 @@ __kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int sr

        *((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
        *((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
        *((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;

        *((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
        *((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
        *((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
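// A 3-channel pixel spans three elements, so unlike the C1/C2/C4 variants the
// C3 flips cannot swap pixels with a single vector load/store; the two
// mirrored pixels are exchanged channel by channel via the data0_*/data1_*
// scalars above.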
@@ -857,7 +857,7 @@ __kernel void arithm_flip_cols_C4_D0 (__global uchar *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -879,7 +879,7 @@ __kernel void arithm_flip_cols_C4_D1 (__global char *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -901,7 +901,7 @@ __kernel void arithm_flip_cols_C4_D2 (__global ushort *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -923,7 +923,7 @@ __kernel void arithm_flip_cols_C4_D3 (__global short *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -946,7 +946,7 @@ __kernel void arithm_flip_cols_C4_D4 (__global int *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);

@@ -968,7 +968,7 @@ __kernel void arithm_flip_cols_C4_D5 (__global float *src, int src_step, int src

    {
        int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);

@@ -991,7 +991,7 @@ __kernel void arithm_flip_cols_C4_D6 (__global double *src, int src_step, int sr

    {
        int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
        int src_index_1 = mad24(y, src_step, ((cols - x -1) << 5) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
        int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 5) + dst_offset);
@@ -60,7 +60,7 @@ __kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);

@@ -82,7 +82,7 @@ __kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_of

    {
        int src_index_0 = mad24(y, src_step, (x) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);

@@ -104,7 +104,7 @@ __kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -126,7 +126,7 @@ __kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -148,7 +148,7 @@ __kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_off

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -170,7 +170,7 @@ __kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -194,7 +194,7 @@ __kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -217,7 +217,7 @@ __kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -239,7 +239,7 @@ __kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_of

    {
        int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);

@@ -261,7 +261,7 @@ __kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -283,7 +283,7 @@ __kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);

@@ -305,7 +305,7 @@ __kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_off

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -327,7 +327,7 @@ __kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);

@@ -351,7 +351,7 @@ __kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);

@@ -375,7 +375,7 @@ __kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);

@@ -408,7 +408,7 @@ __kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_of

    {
        int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);

@@ -441,7 +441,7 @@ __kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);

        int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
        int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);

@@ -473,7 +473,7 @@ __kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_o

    {
        int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
        int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
|
||||
|
||||
@@ -506,14 +506,14 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
|
||||
|
||||
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
|
||||
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
|
||||
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
|
||||
|
||||
|
||||
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
|
||||
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
|
||||
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
|
||||
@@ -521,7 +521,7 @@ __kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_off
|
||||
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
|
||||
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
|
||||
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
|
||||
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
|
||||
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
|
||||
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
|
||||
@@ -538,14 +538,14 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
|
||||
|
||||
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
|
||||
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
|
||||
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
|
||||
|
||||
|
||||
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
|
||||
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
|
||||
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
|
||||
@@ -553,7 +553,7 @@ __kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_o
|
||||
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
|
||||
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
|
||||
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
|
||||
|
||||
|
||||
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
|
||||
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
|
||||
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
|
||||
@@ -572,14 +572,14 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
|
||||
|
||||
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 ));
|
||||
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 ));
|
||||
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
|
||||
|
||||
|
||||
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 ));
|
||||
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 ));
|
||||
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
|
||||
@@ -587,7 +587,7 @@ __kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
|
||||
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
|
||||
@@ -605,7 +605,7 @@ __kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_o
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
|
||||
|
||||
@@ -627,7 +627,7 @@ __kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_of
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
|
||||
|
||||
@@ -649,7 +649,7 @@ __kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
|
||||
|
||||
@@ -671,7 +671,7 @@ __kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_o
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
|
||||
|
||||
@@ -693,7 +693,7 @@ __kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_off
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
|
||||
|
||||
@@ -715,7 +715,7 @@ __kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_o
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
|
||||
|
||||
@@ -739,7 +739,7 @@ __kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset);
|
||||
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset);
|
||||
|
||||
|
||||
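
Every arithm_flip_rc_* hunk above follows the same mirror-index pattern; only the shift or multiplier changes, and it is always the element size in bytes times the channel count. A minimal sketch of the shared idea, with hypothetical names (flip_both, elem_size) that are not part of this commit:

// Both-axes flip: element (y, x) is written to (rows-1-y, cols-1-x).
// A byte offset is row * step + col * elem_size; mad24(a, b, c) computes
// a*b+c with 24-bit operands, which is cheap on GPU hardware.
__kernel void flip_both(__global const uchar *src, __global uchar *dst,
                        int rows, int cols, int step, int elem_size)
{
    int x = get_global_id(0), y = get_global_id(1);
    if (x < cols && y < rows)
    {
        int from = mad24(y, step, x * elem_size);
        int to   = mad24(rows - y - 1, step, (cols - x - 1) * elem_size);
        for (int i = 0; i < elem_size; ++i)
            dst[to + i] = src[from + i];
    }
}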
@@ -48,7 +48,7 @@
#endif

#define INF_FLOAT -88.029694
#define INF_DOUBLE -709.0895657128241

//////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));

@@ -112,16 +112,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
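
The dst_align macro seen here (and again in the mul/sub kernels later in this diff) shifts the source reads left by however many elements the destination offset overshoots a vector boundary, so that the vectorized 4-element store lands on an aligned address; dst_start/dst_end then mask the lanes that fall outside the row. A hedged sketch of the address arithmetic, assuming float data and a hypothetical helper name:

// dst_align = ((dst_offset >> 2) & 3): elements past the previous float4
// boundary. Shifting the column by dst_align makes the destination byte
// index a multiple of 16, the alignment a float4 store wants.
int aligned_dst_byte_index(int dst_offset, int x)
{
    int dst_align = (dst_offset >> 2) & 3;
    return dst_offset + ((x - dst_align) << 2); // == dst_offset + (x<<2) - (dst_align<<2)
}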
@@ -57,37 +57,37 @@
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

@@ -157,7 +157,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
    if(id < elemnum)
    {
        temp = src[idx];
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }

@@ -177,7 +177,7 @@ __kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = src[idx];
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }
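
The repeat_s/repeat_e macros (REPEAT_S0..3, REPEAT_E0..3 variants appear in the next hunks) patch up the first and last vector of a row when the element count is not a multiple of the vector width: the out-of-range leading (s) or trailing (e) lanes are overwritten with a neighbouring in-range lane, so padding can never win the min/max reduction. A sketch of the idea for a 4-wide vector, with a hypothetical name:

// If the first vector of a row starts two lanes before valid data,
// a REPEAT_S2-style fixup copies lane 2 into lanes 0 and 1, so the
// padding lanes hold a value that also exists in the valid region.
float4 repeat_start2(float4 v)
{
    v.s0 = v.s2;
    v.s1 = v.s2;
    return v;
}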
@@ -66,7 +66,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)

@@ -74,7 +74,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)

@@ -82,7 +82,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)

@@ -90,7 +90,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)

@@ -98,7 +98,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)

@@ -106,12 +106,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

#if defined (REPEAT_S0)
#define repeat_s(a) a=a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = a.s1;

@@ -125,7 +125,7 @@

#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;

@@ -159,7 +159,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        temp = src[idx];
        idx_c = idx << 2;
        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
        if(id % cols == 0 )
        {
            repeat_s(temp);
            repeat_s(temploc);

@@ -188,7 +188,7 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        temp = src[idx];
        idx_c = idx << 2;
        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
        if(id % cols == 0 )
        {
            repeat_s(temp);
            repeat_s(temploc);

@@ -228,9 +228,9 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        int lid2 = lsize + lid;
        localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
        localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
        localmem_minloc[lid] =
            CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
        localmem_maxloc[lid] =
            CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -291,7 +291,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        m_temp = CONVERT_TYPE(mask[midx]);
        int idx_c = idx << 2;
        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
        if(id % cols == 0 )
        {
            repeat_ms(m_temp);
            repeat_s(temploc);

@@ -321,7 +321,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        m_temp = CONVERT_TYPE(mask[midx]);
        int idx_c = idx << 2;
        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
        if(id % cols == 0 )
        {
            repeat_ms(m_temp);
            repeat_s(temploc);

@@ -333,7 +333,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        }
        minval = min(minval,m_temp > zero ? temp : max_val);
        maxval = max(maxval,m_temp > zero ? temp : min_val);

        temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
        minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
        maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);

@@ -361,9 +361,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        int lid2 = lsize + lid;
        localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
        localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
        localmem_minloc[lid] =
            CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
        localmem_maxloc[lid] =
            CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
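
The -228/-361 hunks are the body of a local-memory tree reduction: each round halves the active range, folding element lid2 = lsize + lid into lid, and CONDITION_FUNC keeps the location belonging to the surviving extremum. A self-contained sketch of the surrounding loop (the loop structure here is an assumption based on the fold body shown above; names are hypothetical):

// Each work-group reduces its local slice to one minimum.
__kernel void reduce_min_demo(__global const float *in, __global float *out,
                              __local float *lm)
{
    int lid = get_local_id(0);
    lm[lid] = in[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int lsize = (int)get_local_size(0) >> 1; lsize > 0; lsize >>= 1)
    {
        if (lid < lsize)
            lm[lid] = min(lm[lid], lm[lsize + lid]); // fold upper half into lower
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0)
        out[get_group_id(0)] = lm[0];
}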
@@ -68,7 +68,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_char4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)

@@ -77,7 +77,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_ushort4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)

@@ -86,7 +86,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_short4
#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)

@@ -95,7 +95,7 @@
#define VEC_TYPE_LOC int4
#define CONVERT_TYPE convert_int4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)

@@ -104,7 +104,7 @@
#define VEC_TYPE_LOC float4
#define CONVERT_TYPE convert_float4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)

@@ -113,12 +113,12 @@
#define VEC_TYPE_LOC double4
#define CONVERT_TYPE convert_double4
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s3 = a.s2;

@@ -194,7 +194,7 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        }
        minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
        maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);

        minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
        maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
    }

@@ -225,9 +225,9 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
        VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
        VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
        lm_minloc[lid] =
            CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
        lm_maxloc[lid] =
            CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -59,42 +59,42 @@
#define VEC_TYPE char8
#define TYPE char
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define TYPE ushort
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define TYPE short
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define TYPE int
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define TYPE float
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define TYPE double
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
@@ -48,23 +48,23 @@
#endif

int4 round_int4(float4 v){
    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);

    return convert_int4_sat(v);
}
uint4 round_uint4(float4 v){
    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);

    return convert_uint4_sat(v);
}
long round_int(float v){
    v = v + (v > 0 ? 0.5 : -0.5);

    return convert_int_sat(v);
}
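
These helpers implement round-half-away-from-zero by adding ±0.5 before the saturating convert, because convert_*_sat without a rounding-mode suffix truncates toward zero in OpenCL (the built-in alternative would be convert_int4_sat_rte). A scalar restatement with a hypothetical name, plus worked values:

// round_away_from_zero(2.5f)  -> 2.5 + 0.5 = 3.0  -> 3
// round_away_from_zero(-2.5f) -> -2.5 - 0.5 = -3.0 -> -3
// versus plain convert_int_sat(2.5f) == 2 (truncation toward zero)
int round_away_from_zero(float v)
{
    return convert_int_sat(v + (v > 0 ? 0.5f : -0.5f));
}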
@@ -85,24 +85,24 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data ,src2_data;

        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;

        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);

@@ -130,8 +130,8 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -166,8 +166,8 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
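
In arithm_mul_D0 the per-lane `src1_index+k >= 0 ? ... : 0` guards cover the case where the dst_align shift pushes the very first read before the start of the buffer; the bogus zero lanes are discarded later by the dst_start/dst_end test. The load pattern reduced to a helper (name hypothetical):

// Lanes shifted before the buffer start read as 0 and are masked out later.
uchar load_or_zero(__global const uchar *p, int i)
{
    return i >= 0 ? p[i] : (uchar)0;
}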
@@ -137,7 +137,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
    if(id < elemnum)
    {
        temp = src[idx];
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }

@@ -155,7 +155,7 @@ __kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnu
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = src[idx];
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }
@@ -67,7 +67,7 @@ __kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, in
        float data1 = *((__global float *)((__global char *)src1 + src1_index));
        float data2 = *((__global float *)((__global char *)src2 + src2_index));
        float tmp = atan2(data2,data1);

        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }

@@ -92,7 +92,7 @@ __kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, i

        double data1 = *((__global double *)((__global char *)src1 + src1_index));
        double data2 = *((__global double *)((__global char *)src2 + src2_index));

        *((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
    }

@@ -119,7 +119,7 @@ __kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, in
        float data2 = *((__global float *)((__global char *)src2 + src2_index));
        float tmp = atan2(data2,data1);
        float tmp_data = 180*tmp/CV_PI;

        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
    }

@@ -146,7 +146,7 @@ __kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, i
        double data2 = *((__global double *)((__global char *)src2 + src2_index));
        double tmp = atan2(data2,data1);
        double tmp_data = 180*tmp/CV_PI;

        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
    }
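
A worked value for the degree conversion used by both indegrees kernels: atan2(1.0f, 1.0f) returns CV_PI/4 (about 0.785398 radians), and 180*tmp/CV_PI maps it to 45.0 degrees.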
@@ -54,8 +54,8 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, int src1_offset,//magnitude
                                         __global float *src2, int src2_step, int src2_offset,//angle
                                         __global float *dst1, int dst1_step, int dst1_offset,
                                         __global float *dst2, int dst2_step, int dst2_offset,
                                         int rows, int cols, int angInDegree)
{
    int x = get_global_id(0);

@@ -74,7 +74,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in

        float ascale = CV_PI/180.0;
        float alpha = angInDegree == 1 ? y * ascale : y;
        float a = cos(alpha) * x;
        float b = sin(alpha) * x;

        *((__global float *)((__global char *)dst1 + dst1_index)) = a;

@@ -85,8 +85,8 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, int src1_offset,//magnitude
                                         __global double *src2, int src2_step, int src2_offset,//angle
                                         __global double *dst1, int dst1_step, int dst1_offset,
                                         __global double *dst2, int dst2_step, int dst2_offset,
                                         int rows, int cols, int angInDegree)
{
    int x = get_global_id(0);

@@ -105,7 +105,7 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i

        float ascale = CV_PI/180.0;
        double alpha = angInDegree == 1 ? y * ascale : y;
        double a = cos(alpha) * x;
        double b = sin(alpha) * x;

        *((__global double *)((__global char *)dst1 + dst1_index)) = a;

@@ -118,8 +118,8 @@ __kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, i
/////////////////////////////////////////polarToCart without magnitude//////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int src_offset,//angle
                                     __global float *dst1, int dst1_step, int dst1_offset,
                                     __global float *dst2, int dst2_step, int dst2_offset,
                                     int rows, int cols, int angInDegree)
{
    int x = get_global_id(0);

@@ -136,7 +136,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr

        float ascale = CV_PI/180.0;
        float alpha = angInDegree == 1 ? y * ascale : y;
        float a = cos(alpha);
        float b = sin(alpha);

        *((__global float *)((__global char *)dst1 + dst1_index)) = a;

@@ -146,8 +146,8 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int src_offset,//angle
                                     __global float *dst1, int dst1_step, int dst1_offset,
                                     __global float *dst2, int dst2_step, int dst2_offset,
                                     int rows, int cols, int angInDegree)
{
    int x = get_global_id(0);

@@ -164,7 +164,7 @@ __kernel void arithm_polarToCart_D6 (__global float *src, int src_step, int sr

        float ascale = CV_PI/180.0;
        double alpha = angInDegree == 1 ? y * ascale : y;
        double a = cos(alpha);
        double b = sin(alpha);

        *((__global double *)((__global char *)dst1 + dst1_index)) = a;
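
The polarToCart kernels are the textbook mapping x = r*cos(theta), y = r*sin(theta), with an optional degrees-to-radians rescale (ascale = CV_PI/180). A minimal scalar sketch under those definitions; the helper name is hypothetical, and CV_PI is defined locally here since these kernels normally receive it from build options:

#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif

// Returns (x, y) for a polar sample; angInDegree selects the unit, as above.
float2 polar_to_cart(float mag, float angle, int angInDegree)
{
    float alpha = (angInDegree == 1) ? angle * (float)(CV_PI / 180.0) : angle;
    return (float2)(cos(alpha) * mag, sin(alpha) * mag);
}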
@@ -70,7 +70,7 @@ __kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offse

        float src1_data = *((__global float *)((__global char *)src1 + src1_index));
        float tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));

        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }

@@ -92,7 +92,7 @@ __kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offs
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        double src1_data = *((__global double *)((__global char *)src1 + src1_index));
        double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
        *((__global double *)((__global char *)dst + dst_index)) = tmp;
    }
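
arithm_pow computes x^p as exp(p*log|x|), with 0^p fixed up to 0. Note the negative-x branch works on |x|, so the sign of the result for negative bases is not restored here; that is a property of the kernel as written, not of this commit. Equivalent scalar form for reference (hypothetical name):

float pow_via_exp_log(float x, float p)
{
    if (x == 0.0f) return 0.0f;
    return exp(p * log(fabs(x)));   // sign of negative bases is dropped, as above
}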
@@ -64,8 +64,8 @@ __kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -99,8 +99,8 @@ __kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offs
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -134,8 +134,8 @@ __kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offse
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@@ -240,8 +240,8 @@ __kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -279,8 +279,8 @@ __kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step,
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -316,8 +316,8 @@ __kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, i
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -362,7 +362,7 @@ __kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, i
        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat((long)src_data1 - (long)src_data2);
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }

@@ -392,7 +392,7 @@ __kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, i
        float dst_data = *((__global float *)((__global char *)dst + dst_index));

        float data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }

@@ -424,7 +424,7 @@ __kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step,
        double dst_data = *((__global double *)((__global char *)dst + dst_index));

        double data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }

@@ -446,8 +446,8 @@ __kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -493,7 +493,7 @@ __kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step,

        int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -523,7 +523,7 @@ __kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, i

        int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -552,7 +552,7 @@ __kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, i
        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -581,7 +581,7 @@ __kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, i
        float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }

@@ -612,7 +612,7 @@ __kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step,
        double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -633,8 +633,8 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);

@@ -660,17 +660,17 @@ __kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
        uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2));

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                   ? tmp_data_0.w : data_0.w;

        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                    ? tmp_data_1.xy : data_1.xy;
        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.zw : data_1.zw;

        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
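
In the C3_D0 masked hunk above, twelve interleaved uchar channel values live in three uchar4 vectors, so mask pixel k governs byte lanes 3k..3k+2: mask.x covers data_0.xyz, mask.y covers data_0.w plus data_1.xy, mask.z covers data_1.zw plus data_2.x, and mask.w covers data_2.yzw. A sketch of the lane bookkeeping (hypothetical helper):

// Pixel k of a 3-channel uchar image starts at byte 3*k; within the three
// packed uchar4 vectors, that byte sits in vector (3*k)/4, lane (3*k)%4.
int2 c3_lane(int k)
{
    int byte = 3 * k;
    return (int2)(byte >> 2, byte & 3); // (vector index, lane index)
}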
@@ -693,8 +693,8 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -721,12 +721,12 @@ __kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step,
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
@@ -749,8 +749,8 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -777,12 +777,12 @@ __kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, i
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
@@ -802,8 +802,8 @@ __kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, i
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
@@ -846,15 +846,15 @@ __kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, i
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
|
||||
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
|
||||
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
|
||||
float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
|
||||
float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
|
||||
float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
|
||||
@@ -892,15 +892,15 @@ __kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step,
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
|
||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
||||
|
||||
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
|
||||
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
|
||||
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
|
||||
|
||||
|
||||
double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
|
||||
double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
|
||||
double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
|
||||
@@ -949,7 +949,7 @@ __kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
|
||||
uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2));
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
@@ -978,7 +978,7 @@ __kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step,
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -1007,7 +1007,7 @@ __kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, i
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -1036,7 +1036,7 @@ __kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, i
|
||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||
|
||||
int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2));
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -1065,7 +1065,7 @@ __kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, i
|
||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||
|
||||
float4 data = src_data1 - src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global float4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
@@ -1096,7 +1096,7 @@ __kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step,
|
||||
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
||||
|
||||
double4 data = src_data1 - src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global double4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ __kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -94,7 +94,7 @@ __kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -127,7 +127,7 @@ __kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -231,7 +231,7 @@ __kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -385,7 +385,7 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -395,9 +395,9 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
@@ -416,17 +416,17 @@ __kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src
        uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);

        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -447,7 +447,7 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -479,12 +479,12 @@ __kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int sr

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -505,7 +505,7 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
@@ -537,12 +537,12 @@ __kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src

        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -560,7 +560,7 @@ __kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
@@ -602,13 +602,13 @@ __kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));

        float src2_data_0 = src2.x;
        float src2_data_1 = src2.y;
        float src2_data_2 = src2.z;
@@ -642,13 +642,13 @@ __kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int sr

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));

        double src2_data_0 = src2.x;
        double src2_data_1 = src2.y;
        double src2_data_2 = src2.z;
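The dst_align idiom in these hunks keeps the vectorized destination store aligned: each work-item covers several pixels, and when dst_offset is misaligned the source index is shifted back by the misalignment while dst_start/dst_end clamp the lanes that fall outside the valid row. A worked check in plain C; the dst_index rounding is not visible in these hunks and is our assumption about the surrounding code:

#include <stdio.h>

int main(void)
{
    /* made-up geometry: a row of 40 valid dst bytes starting at byte 6 */
    int y = 0, x = 0 << 2;                    /* first work-item, 4 pixels */
    int src1_step = 64, src1_offset = 6;
    int dst_step = 64, dst_offset = 6, dst_step1 = 40;

    int dst_align  = dst_offset & 3;                           /* = 2     */
    int src1_index = y * src1_step + x + src1_offset - dst_align;
    int dst_start  = y * dst_step + dst_offset;                /* = 6     */
    int dst_end    = y * dst_step + dst_offset + dst_step1;    /* = 46    */
    /* assumed (not shown in the hunks): dst_index rounds down to a
       uchar4 boundary, so the vector store at byte 4 is aligned */
    int dst_index  = y * dst_step + ((dst_offset + x) & ~3);   /* = 4     */

    /* bytes 4 and 5 precede dst_start, so the (dst_index + k >= dst_start)
       selects keep the old destination values in those lanes */
    printf("align=%d src1=%d start=%d end=%d dst=%d\n",
           dst_align, src1_index, dst_start, dst_end, dst_index);
    return 0;
}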
@@ -62,7 +62,7 @@ __kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_ste
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -100,7 +100,7 @@ __kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_st
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -136,7 +136,7 @@ __kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_ste
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -182,7 +182,7 @@ __kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_ste
        long tmp = (long)src_data1 - (long)src_data2;
        tmp = isMatSubScalar ? tmp : - tmp;
        int data = convert_int_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
@@ -211,7 +211,7 @@ __kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_s

        float data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
@@ -242,7 +242,7 @@ __kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_

        double data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
@@ -262,7 +262,7 @@ __kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_ste
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -302,13 +302,13 @@ __kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_st
        uchar mask_data = *(mask + mask_index);

        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        int2    src_data2 = (int2)(src2.x, src2.y);
        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -331,13 +331,13 @@ __kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_ste
        uchar mask_data = *(mask + mask_index);

        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        int2   src_data2 = (int2)(src2.x, src2.y);
        short2 dst_data  = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -360,13 +360,13 @@ __kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step,
        uchar mask_data = *(mask + mask_index);

        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        int2 dst_data  = *((__global int2 *)((__global char *)dst + dst_index));

        long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
        tmp = isMatSubScalar ? tmp : -tmp;
        int2 data = convert_int2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -389,12 +389,12 @@ __kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_ste
        uchar mask_data = *(mask + mask_index);

        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = (float2)(src2.x, src2.y);
        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -419,12 +419,12 @@ __kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_st
        uchar mask_data = *(mask + mask_index);

        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = (double2)(src2.x, src2.y);
        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
@@ -444,7 +444,7 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
        x = x << 2;

#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -455,9 +455,9 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);

        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);

        uchar4 mask_data = vload4(0, mask + mask_index);

@@ -478,17 +478,17 @@ __kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_ste
        uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);

        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
        data_0.w   = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_0.w : data_0.w;

        data_1.xy  = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
                     ? tmp_data_1.xy : data_1.xy;
        data_1.zw  = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_1.zw : data_1.zw;

        data_2.x   = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                     ? tmp_data_2.x : data_2.x;
        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
                     ? tmp_data_2.yzw : data_2.yzw;

        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
@@ -510,7 +510,7 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -521,9 +521,9 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st
        ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
        ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));

        int2 src2_data_0 = (int2)(src2.x, src2.y);
        int2 src2_data_1 = (int2)(src2.z, src2.x);
        int2 src2_data_2 = (int2)(src2.y, src2.z);

        uchar2 mask_data = vload2(0, mask + mask_index);

@@ -545,12 +545,12 @@ __kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_st

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -572,7 +572,7 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
        x = x << 1;

#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
@@ -583,9 +583,9 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste
        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));

        int2 src2_data_0 = (int2)(src2.x, src2.y);
        int2 src2_data_1 = (int2)(src2.z, src2.x);
        int2 src2_data_2 = (int2)(src2.y, src2.z);

        uchar2 mask_data = vload2(0, mask + mask_index);

@@ -607,12 +607,12 @@ __kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_ste

        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
@@ -631,7 +631,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

@@ -639,9 +639,9 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));

        int src2_data_0 = src2.x;
        int src2_data_1 = src2.y;
        int src2_data_2 = src2.z;

        uchar mask_data = * (mask + mask_index);

@@ -652,7 +652,7 @@ __kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step,
        long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
        long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
        long tmp_2 = (long)src1_data_2 - (long)src2_data_2;

        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
@@ -681,17 +681,17 @@ __kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_ste

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));

        float src2_data_0 = src2.x;
        float src2_data_1 = src2.y;
        float src2_data_2 = src2.z;

        uchar mask_data = * (mask + mask_index);

@@ -729,17 +729,17 @@ __kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_st

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));

        double src2_data_0 = src2.x;
        double src2_data_1 = src2.y;
        double src2_data_2 = src2.z;

        uchar mask_data = * (mask + mask_index);

@@ -789,7 +789,7 @@ __kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_ste
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 data = convert_uchar4_sat(tmp);

        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
@@ -818,7 +818,7 @@ __kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_st
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort4 data = convert_ushort4_sat(tmp);

        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -847,7 +847,7 @@ __kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_ste
        tmp = isMatSubScalar ? tmp : -tmp;
        short4 data = convert_short4_sat(tmp);

        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -876,7 +876,7 @@ __kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step,
        tmp = isMatSubScalar ? tmp : -tmp;
        int4 data = convert_int4_sat(tmp);

        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -904,7 +904,7 @@ __kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_ste
        float4 data = src_data1 - src2;
        data = isMatSubScalar ? data : -data;

        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }
@@ -933,7 +933,7 @@ __kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_st

        double4 data = src_data1 - src2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
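Throughout this file the isMatSubScalar flag lets one kernel serve both mat - scalar and scalar - mat: the difference is always computed as src - scalar and the sign is flipped afterwards, before saturation and the mask select. One lane of that logic as a plain-C sketch (the helper name is ours):

#include <stdio.h>

/* One lane of the masked scalar-subtract kernels: compute src - scalar,
   flip the sign for scalar - src, saturate to uchar, apply the mask. */
static unsigned char s_sub_with_mask(unsigned char src, int scalar,
                                     int isMatSubScalar,
                                     unsigned char mask, unsigned char dst)
{
    int tmp = (int)src - scalar;
    tmp = isMatSubScalar ? tmp : -tmp;
    unsigned char data = tmp < 0 ? 0 : (tmp > 255 ? 255 : (unsigned char)tmp);
    return mask ? data : dst;
}

int main(void)
{
    /* 5 - 20 saturates to 0; 20 - 5 gives 15; a masked-out pixel keeps 99 */
    printf("%u %u %u\n",
           s_sub_with_mask(5, 20, 1, 255, 99),
           s_sub_with_mask(5, 20, 0, 255, 99),
           s_sub_with_mask(5, 20, 1, 0, 99));
    return 0;
}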
@@ -151,7 +151,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
    if(id < elemnum)
    {
        temp = CONVERT_RES_TYPE(src[idx]);
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }
@@ -169,7 +169,7 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = CONVERT_RES_TYPE(src[idx]);
        if(id % cols == 0 )
        {
            repeat_s(temp);
        }
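In both sum kernels, idx = offset + id + (id / cols) * invalid_cols maps a flat element id onto a row-padded matrix: every cols valid elements are followed by invalid_cols padding elements that must be skipped, and repeat_s (defined outside these hunks) appears to zero the lanes that precede a row start. A small C check of the index mapping, with made-up sizes:

#include <stdio.h>

int main(void)
{
    /* each matrix row holds `cols` valid elements followed by
       `invalid_cols` padding elements that must be skipped */
    int cols = 10, invalid_cols = 2, offset = 0;
    for (int id = 8; id <= 11; ++id)
    {
        int idx = offset + id + (id / cols) * invalid_cols;
        printf("id=%d -> idx=%d\n", id, idx);  /* 8->8, 9->9, 10->12, 11->13 */
    }
    return 0;
}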
@@ -159,7 +159,7 @@
#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
#endif

__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
                               __global VEC_TYPE *src, __global RES_TYPE *dst)
{
    unsigned int lid = get_local_id(0);
@@ -176,7 +176,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
        temp1 = CONVERT_RES_TYPE(src[idx]);
        temp2 = CONVERT_RES_TYPE(src[idx+1]);
        temp3 = CONVERT_RES_TYPE(src[idx+2]);
        if(id % cols == 0 )
        {
            repeat_s(temp1,temp2,temp3);
        }
@@ -201,7 +201,7 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
        temp1 = CONVERT_RES_TYPE(src[idx]);
        temp2 = CONVERT_RES_TYPE(src[idx+1]);
        temp3 = CONVERT_RES_TYPE(src[idx+2]);
        if(id % cols == 0 )
        {
            repeat_s(temp1,temp2,temp3);
        }

@@ -43,14 +43,14 @@
//
//M*/

#define TILE_DIM   32
#define BLOCK_ROWS 8
#define LDS_STEP   (TILE_DIM + 1)

//8UC1 needs no extra optimization, as the size of write per thread is 8,
//which will use completepath
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
                              __global uchar* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -62,13 +62,13 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -87,7 +87,7 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, x);

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -109,14 +109,14 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}

__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
                              __global int* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -128,13 +128,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -153,7 +153,7 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -175,13 +175,13 @@ __kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
                              __global float* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -193,13 +193,13 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -218,7 +218,7 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -240,14 +240,14 @@ __kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}

__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
                              __global ushort* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -259,13 +259,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -284,7 +284,7 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -306,13 +306,13 @@ __kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset
        {
            if((y_index + i) < src_cols)
            {
                *((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
                              __global short* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -324,13 +324,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -349,7 +349,7 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -371,13 +371,13 @@ __kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
                              __global uchar* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -389,13 +389,13 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -414,7 +414,7 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -436,14 +436,14 @@ __kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }
    }
}

__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
                              __global char* dst, int dst_step, int dst_offset,
                              int src_rows, int src_cols)
{
@@ -455,13 +455,13 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,

    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        int bid = gp_x + gs_x * gp_y;
        groupId_y = bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }

@@ -480,7 +480,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
    {
        int index_src = mad24(y, src_step, (x << 2));

#pragma unroll
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            if(y + i < src_rows)
@@ -502,7 +502,7 @@ __kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
        {
            if((y_index + i) < src_cols)
            {
                *((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
                index_dst += dst_step * BLOCK_ROWS ;
            }
        }

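Each transpose kernel above remaps its work-group id before computing tile coordinates: for square matrices groupId_y = gp_x and groupId_x = (gp_x + gp_y) % gs_x, the classic diagonal block reordering that spreads simultaneous column reads and row writes across memory partitions instead of letting every group hit the same one; the rectangular branch linearizes the id first. The LDS_STEP = TILE_DIM + 1 padding similarly staggers local-memory rows to avoid bank conflicts. The remap for a 4x4 grid, printed from plain C:

#include <stdio.h>

int main(void)
{
    /* diagonal work-group reordering for a square 4x4 grid of tiles,
       mirroring the src_rows == src_cols branch of the transpose kernels */
    int gs_x = 4;
    for (int gp_y = 0; gp_y < 4; ++gp_y)
    {
        for (int gp_x = 0; gp_x < 4; ++gp_x)
        {
            int groupId_y = gp_x;
            int groupId_x = (gp_x + gp_y) % gs_x;
            printf("(%d,%d)->(%d,%d) ", gp_x, gp_y, groupId_x, groupId_y);
        }
        printf("\n");
    }
    return 0;
}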
@@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Liu Liujun, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@@ -43,103 +43,103 @@
//
//M*/
__kernel void BlendLinear_C1_D0(
    __global uchar *dst,
    __global uchar *img1,
    __global uchar *img2,
    __global float *weight1,
    __global float *weight2,
    int rows,
    int cols,
    int istep,
    int wstep
    )
{
    int idx = get_global_id(0);
    int idy = get_global_id(1);
    if (idx < cols && idy < rows)
    {
        int pos  = mad24(idy,istep,idx);
        int wpos = mad24(idy,wstep,idx);
        float w1 = weight1[wpos];
        float w2 = weight2[wpos];
        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
    }
}

__kernel void BlendLinear_C4_D0(
    __global uchar *dst,
    __global uchar *img1,
    __global uchar *img2,
    __global float *weight1,
    __global float *weight2,
    int rows,
    int cols,
    int istep,
    int wstep
    )
{
    int idx = get_global_id(0);
    int idy = get_global_id(1);
    int x = idx / 4;
    int y = idy;
    if (x < cols && y < rows)
    {
        int pos  = mad24(idy,istep,idx);
        int wpos = mad24(idy,wstep,x);
        float w1 = weight1[wpos];
        float w2 = weight2[wpos];
        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
    }
}

__kernel void BlendLinear_C1_D5(
    __global float *dst,
    __global float *img1,
    __global float *img2,
    __global float *weight1,
    __global float *weight2,
    int rows,
    int cols,
    int istep,
    int wstep
    )
{
    int idx = get_global_id(0);
    int idy = get_global_id(1);
    if (idx < cols && idy < rows)
    {
        int pos  = mad24(idy,istep,idx);
        int wpos = mad24(idy,wstep,idx);
        float w1 = weight1[wpos];
        float w2 = weight2[wpos];
        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
    }
}

__kernel void BlendLinear_C4_D5(
    __global float *dst,
    __global float *img1,
    __global float *img2,
    __global float *weight1,
    __global float *weight2,
    int rows,
    int cols,
    int istep,
    int wstep
    )
{
    int idx = get_global_id(0);
    int idy = get_global_id(1);
    int x = idx / 4;
    int y = idy;
    if (x < cols && y < rows)
    {
        int pos  = mad24(idy,istep,idx);
        int wpos = mad24(idy,wstep,x);
        float w1 = weight1[wpos];
        float w2 = weight2[wpos];
        dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
    }
}

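All four BlendLinear kernels evaluate the same per-element weighted average, dst = (img1*w1 + img2*w2) / (w1 + w2 + 1e-5f); the 1e-5f term guards against division by zero where both weights vanish, and the C4 variants divide the x id by 4 so the four interleaved channels of a pixel share one weight pair. A host-side reference for a single uchar element:

#include <stdio.h>

/* Host-side reference for one element of BlendLinear_C1_D0 */
static unsigned char blend_u8(unsigned char a, unsigned char b,
                              float w1, float w2)
{
    return (unsigned char)((a * w1 + b * w2) / (w1 + w2 + 1e-5f));
}

int main(void)
{
    printf("%u\n", blend_u8(100, 200, 0.25f, 0.75f)); /* 174              */
    printf("%u\n", blend_u8(100, 200, 0.0f, 0.0f));   /* 0, no div-by-zero */
    return 0;
}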
File diff suppressed because it is too large
@@ -1,237 +1,237 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

__kernel
void buildWarpPlaneMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * KRT,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    int du = get_global_id(0);
    int dv = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    __constant float * ck_rinv = KRT;
    __constant float * ct = KRT + 9;

    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;

        float x_ = u / scale - ct[0];
        float y_ = v / scale - ct[1];

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);

        x /= z;
        y /= z;

        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}

__kernel
void buildWarpCylindricalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    int du = get_global_id(0);
    int dv = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;

        u /= scale;
        float x_ = sin(u);
        float y_ = v / scale;
        float z_ = cos(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;

        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}

__kernel
void buildWarpSphericalMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
)
{
    int du = get_global_id(0);
    int dv = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;

        v /= scale;
        u /= scale;

        float sinv = sin(v);
        float x_ = sinv * sin(u);
        float y_ = - cos(v);
        float z_ = sinv * cos(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;

        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}

__kernel
void buildWarpAffineMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        map_x[y * step_x + x] = xcoo;
        map_y[y * step_y + x] = ycoo;
    }
}

__kernel
void buildWarpPerspectiveMaps
(
    __global float * map_x,
    __global float * map_y,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);

        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        map_x[y * step_x + x] = xcoo;
        map_y[y * step_y + x] = ycoo;
    }
}

/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel
|
||||
void buildWarpPlaneMaps
|
||||
(
|
||||
__global float * map_x,
|
||||
__global float * map_y,
|
||||
__constant float * KRT,
|
||||
int tl_u,
|
||||
int tl_v,
|
||||
int cols,
|
||||
int rows,
|
||||
int step_x,
|
||||
int step_y,
|
||||
float scale
|
||||
)
|
||||
{
|
||||
int du = get_global_id(0);
|
||||
int dv = get_global_id(1);
|
||||
step_x /= sizeof(float);
|
||||
step_y /= sizeof(float);
|
||||
|
||||
__constant float * ck_rinv = KRT;
|
||||
__constant float * ct = KRT + 9;
|
||||
|
||||
if (du < cols && dv < rows)
|
||||
{
|
||||
float u = tl_u + du;
|
||||
float v = tl_v + dv;
|
||||
float x, y;
|
||||
|
||||
float x_ = u / scale - ct[0];
|
||||
float y_ = v / scale - ct[1];
|
||||
|
||||
float z;
|
||||
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
|
||||
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
|
||||
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
|
||||
|
||||
x /= z;
|
||||
y /= z;
|
||||
|
||||
map_x[dv * step_x + du] = x;
|
||||
map_y[dv * step_y + du] = y;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
void buildWarpCylindricalMaps
    (
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
    )
{
    int du = get_global_id(0);
    int dv = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;

        u /= scale;
        float x_ = sin(u);
        float y_ = v / scale;
        float z_ = cos(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;

        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}

__kernel
void buildWarpSphericalMaps
    (
    __global float * map_x,
    __global float * map_y,
    __constant float * ck_rinv,
    int tl_u,
    int tl_v,
    int cols,
    int rows,
    int step_x,
    int step_y,
    float scale
    )
{
    int du = get_global_id(0);
    int dv = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (du < cols && dv < rows)
    {
        float u = tl_u + du;
        float v = tl_v + dv;
        float x, y;

        v /= scale;
        u /= scale;

        float sinv = sin(v);
        float x_ = sinv * sin(u);
        float y_ = -cos(v);
        float z_ = sinv * cos(u);

        float z;
        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;

        if (z > 0) { x /= z; y /= z; }
        else x = y = -1;

        map_x[dv * step_x + du] = x;
        map_y[dv * step_y + du] = y;
    }
}

__kernel
void buildWarpAffineMaps
    (
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
    )
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
        const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];

        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}

__kernel
void buildWarpPerspectiveMaps
    (
    __global float * xmap,
    __global float * ymap,
    __constant float * c_warpMat,
    int cols,
    int rows,
    int step_x,
    int step_y
    )
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    step_x /= sizeof(float);
    step_y /= sizeof(float);

    if (x < cols && y < rows)
    {
        const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);

        const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
        const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);

        xmap[y * step_x + x] = xcoo;
        ymap[y * step_y + x] = ycoo;
    }
}

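All five kernels above fill remap tables by inverse mapping: each work-item takes one destination pixel (u, v) and computes the source coordinates to sample there. For the perspective kernel, reading c_warpMat as a row-major 3x3 matrix m (evidently the inverse transform, given the division by its third row), the arithmetic is

    x = \frac{m_{00}\,u + m_{01}\,v + m_{02}}{m_{20}\,u + m_{21}\,v + m_{22}},
    \qquad
    y = \frac{m_{10}\,u + m_{11}\,v + m_{12}}{m_{20}\,u + m_{21}\,v + m_{22}}

The affine kernel is the same computation without the division, and the plane, cylindrical and spherical kernels first lift (u, v) onto their model surface before projecting through ck_rinv and dividing by z.
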
@@ -36,106 +36,106 @@

#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
                          int dstStep_in_pixel, int pixel_end)
{
    int id = get_global_id(0);
    //int pixel_end = mul24(cols -1 , rows -1);
    int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
    pixelid = clamp(pixelid,0,pixel_end);
    GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
    pixel0 = src[pixelid.x];
    pixel1 = src[pixelid.y];
    pixel2 = src[pixelid.z];

    outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
    outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
    outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
    outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);

    int4 outy = (id<<2)/cols;
    int4 outx = (id<<2)%cols;
    outx.y++;
    outx.z+=2;
    outx.w+=3;
    outy = select(outy,outy+1,outx>=cols);
    outx = select(outx,outx-cols,outx>=cols);
    //outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
    //outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
    //outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
    //outx = select(outx,(int4)outx.x,outy>=rows);
    //outy = select(outy,(int4)outy.x,outy>=rows);
    int4 addr = mad24(outy,(int4)dstStep_in_pixel,outx);
    if(outx.w<cols && outy.w<rows)
    {
        dst[addr.x] = outpix0;
        dst[addr.y] = outpix1;
        dst[addr.z] = outpix2;
        dst[addr.w] = outpix3;
    }
    else if(outx.z<cols && outy.z<rows)
    {
        dst[addr.x] = outpix0;
        dst[addr.y] = outpix1;
        dst[addr.z] = outpix2;
    }
    else if(outx.y<cols && outy.y<rows)
    {
        dst[addr.x] = outpix0;
        dst[addr.y] = outpix1;
    }
    else if(outx.x<cols && outy.x<rows)
    {
        dst[addr.x] = outpix0;
    }
}

__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
                          int srcStep_in_pixel, int pixel_end)
{
    int id = get_global_id(0)<<2;
    int y = id / cols;
    int x = id % cols;
    int4 x4 = (int4)(x,x+1,x+2,x+3);
    int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
    y4 = clamp(y4,(int4)0,(int4)(rows-1));
    x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
    int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
    GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
    pixel0 = src[addr.x];
    pixel1 = src[addr.y];
    pixel2 = src[addr.z];
    pixel3 = src[addr.w];

    pixel0.w = pixel1.x;
    outpixel1.x = pixel1.y;
    outpixel1.y = pixel1.z;
    outpixel1.z = pixel2.x;
    outpixel1.w = pixel2.y;
    outpixel2.x = pixel2.z;
    outpixel2.y = pixel3.x;
    outpixel2.z = pixel3.y;
    outpixel2.w = pixel3.z;
    int4 outaddr = mul24(id>>2 , 3);
    outaddr.y++;
    outaddr.z+=2;
    if(outaddr.z <= pixel_end)
    {
        dst[outaddr.x] = pixel0;
        dst[outaddr.y] = outpixel1;
        dst[outaddr.z] = outpixel2;
    }
    else if(outaddr.y <= pixel_end)
    {
        dst[outaddr.x] = pixel0;
        dst[outaddr.y] = outpixel1;
    }
    else if(outaddr.x <= pixel_end)
    {
        dst[outaddr.x] = pixel0;
    }
}

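A minimal host-side sketch of the packing scheme convertC3C4 implements: every work-item consumes three 4-element vectors (twelve interleaved channel values) and emits four padded 4-channel pixels. Plain C stand-ins replace the OpenCL vector types; all names here are illustrative only.

    #include <stdio.h>

    /* Sketch of the convertC3C4 packing: 12 packed channel values
       (4 RGB pixels) become 4 RGBX pixels with a zero pad channel. */
    int main(void)
    {
        unsigned char src[12];                     /* 4 packed 3-channel pixels */
        for (int i = 0; i < 12; i++) src[i] = (unsigned char)i;

        unsigned char dst[16];                     /* 4 padded 4-channel pixels */
        for (int p = 0; p < 4; p++)
        {
            for (int c = 0; c < 3; c++)
                dst[p * 4 + c] = src[p * 3 + c];   /* same channel order */
            dst[p * 4 + 3] = 0;                    /* padding channel */
        }

        for (int p = 0; p < 4; p++)
            printf("pixel %d: %d %d %d %d\n", p,
                   dst[p * 4], dst[p * 4 + 1], dst[p * 4 + 2], dst[p * 4 + 3]);
        return 0;
    }

convertC4C3 performs the inverse shuffle, which is exactly the outpixel1/outpixel2 assignments above read backwards.
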
@@ -78,4 +78,4 @@ __kernel void RGB2Gray(int cols,int rows,int src_step,int dst_step,int channels,
        int dst_idx = y * dst_step + x * sizeof(DATA_TYPE);
        dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
    }
}

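CV_DESCALE(v, n) in OpenCV is the rounding right shift ((v) + (1 << ((n)-1))) >> (n), so per pixel the store above computes

    \mathrm{gray} = \big( B \cdot B2Y + G \cdot G2Y + R \cdot R2Y + 2^{\,\texttt{yuv\_shift}-1} \big) \gg \texttt{yuv\_shift}

where B2Y, G2Y and R2Y are presumably the BT.601 weights 0.114, 0.587 and 0.299 scaled by 2^yuv_shift; they are defined outside this hunk, so that reading is an assumption.
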
@@ -83,7 +83,7 @@ Now(6/29/2011) the kernels only support 8U data type and the anchor of the convolution
kernel must be in the center. ROI is not supported either.
Each kernel reads 4 elements (not 4 pixels), saves them to LDS and reads the data needed
from LDS to calculate the result.
The length of the convolution kernel supported is only related to the MAX size of LDS,
which is HW related.
Niko
6/29/2011
@@ -92,56 +92,56 @@ The info above may be obsolete.


__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
    (__global const GENTYPE_SRC * restrict src,
     __global GENTYPE_DST * dst,
     const int dst_cols,
     const int dst_rows,
     const int src_whole_cols,
     const int src_whole_rows,
     const int src_step_in_pixel,
     //const int src_offset_x,
     //const int src_offset_y,
     const int dst_step_in_pixel,
     const int dst_offset_in_pixel,
     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int start_addr = mad24(y,src_step_in_pixel,x);
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    int i;
    GENTYPE_SRC sum;
    GENTYPE_SRC temp[READ_TIMES_COL];

    __local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];

    //read pixels from src
    for(i = 0; i<READ_TIMES_COL; i++)
    {
        int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
        current_addr = current_addr < end_addr ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    //save pixels to lds
    for(i = 0; i<READ_TIMES_COL; i++)
    {
        LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //read pixels from lds and calculate the result
    sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
    for(i=1; i<=RADIUSY; i++)
    {
        temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
        temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
        sum += temp[0] * mat_kernel[RADIUSY-i] + temp[1] * mat_kernel[RADIUSY+i];
    }
    //write the result to dst
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
        dst[start_addr] = convert_to_DST(sum);
    }
}

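The accumulation loop in col_filter is a plain 1-D vertical correlation over the rows cached in LDS:

    \mathrm{dst}(x,\,y) = \sum_{i=-R}^{R} k[R+i]\,\mathrm{src}(x,\,y+i), \qquad R = \texttt{RADIUSY}

with the center tap handled first and the symmetric tap pairs added inside the loop.
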
@@ -83,7 +83,7 @@ These kernels are written for separable filters such as Sobel, Scharr, GaussianB
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convolution
kernel must be in the center. ROI is not supported either.
For channels = 1,2,4, each kernel reads 4 elements (not 4 pixels), and for channels = 3,
the kernel reads 4 pixels, saves them to LDS and reads the data needed from LDS to
calculate the result.
The length of the convolution kernel supported is related to the LSIZE0 and the MAX size
of LDS, which is HW related.
@@ -96,375 +96,375 @@ The info above may be obsolete.
***********************************************************************************/

__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
    (__global const uchar * restrict src,
     __global float * dst,
     const int dst_cols,
     const int dst_rows,
     const int src_whole_cols,
     const int src_whole_rows,
     const int src_step_in_pixel,
     const int src_offset_x,
     const int src_offset_y,
     const int dst_step_in_pixel,
     const int radiusy,
     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0)<<2;
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int start_x = (x+src_offset_x-RADIUSX) & 0xfffffffc;
    int offset = (src_offset_x-RADIUSX) & 3;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float4 sum;
    uchar4 temp[READ_TIMES_ROW];

    __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        int current_addr = start_addr+i*LSIZE0*4;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = *(__global uchar4*)&src[current_addr];
    }
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
        temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
        temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
        temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
        temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
    }
#else
    int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
    int4 index[READ_TIMES_ROW];
    int4 addr;
    int s_y;
    if(not_all_in_range)
    {
        //judge if read out of boundary
        for(i = 0; i<READ_TIMES_ROW; i++)
        {
            index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
            index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
            index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
            index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
            index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
            index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
            index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
            index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
        }
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        //read pixels from src
        for(i = 0; i<READ_TIMES_ROW; i++)
        {
            addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
            temp[i].x = src[addr.x];
            temp[i].y = src[addr.y];
            temp[i].z = src[addr.z];
            temp[i].w = src[addr.w];
        }
    }
    else
    {
        //read pixels from src
        for(i = 0; i<READ_TIMES_ROW; i++)
        {
            temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
        }
    }
#endif

    //save pixels to lds
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //read pixels from lds and calculate the result
    sum = convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
    for(i=1; i<=RADIUSX; i++)
    {
        temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
        temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
        sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
    }
    start_addr = mad24(y,dst_step_in_pixel,x);
    //write the result to dst
    if((x+3<dst_cols) & (y<dst_rows))
    {
        *(__global float4*)&dst[start_addr] = sum;
    }
    else if((x+2<dst_cols) & (y<dst_rows))
    {
        dst[start_addr] = sum.x;
        dst[start_addr+1] = sum.y;
        dst[start_addr+2] = sum.z;
    }
    else if((x+1<dst_cols) & (y<dst_rows))
    {
        dst[start_addr] = sum.x;
        dst[start_addr+1] = sum.y;
    }
    else if((x<dst_cols) & (y<dst_rows))
    {
        dst[start_addr] = sum.x;
    }
}
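The start_x/offset computation above is an alignment trick: the left-most column a work-group needs is rounded down to a multiple of 4 so every uchar4 load is 4-byte aligned, and the remainder is carried as a sub-vector offset into the vload4 calls (x itself is a multiple of 4, so it drops out of the low bits). A small stand-alone C sketch of the index arithmetic, with illustrative names:

    #include <stdio.h>

    /* start is the aligned load base, offset the shift inside the vector;
       start + offset always recovers the first column actually needed. */
    int main(void)
    {
        int radius = 3;                           /* stand-in for RADIUSX */
        for (int src_offset_x = 0; src_offset_x < 4; src_offset_x++)
        {
            int first  = src_offset_x - radius;   /* left-most column needed */
            int start  = first & 0xfffffffc;      /* rounded down to 4 */
            int offset = first & 3;               /* sub-vector shift */
            printf("offset_x=%d  first=%d  start=%d  offset=%d\n",
                   src_offset_x, first, start, offset);
        }
        return 0;
    }
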
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
    (__global const uchar4 * restrict src,
     __global float4 * dst,
     const int dst_cols,
     const int dst_rows,
     const int src_whole_cols,
     const int src_whole_rows,
     const int src_step_in_pixel,
     const int src_offset_x,
     const int src_offset_y,
     const int dst_step_in_pixel,
     const int radiusy,
     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float4 sum;
    uchar4 temp[READ_TIMES_ROW];

    __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i] = src[index[i]];
    }
#endif

    //save pixels to lds
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //read pixels from lds and calculate the result
    sum = convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
    for(i=1; i<=RADIUSX; i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
    }
    //write the result to dst
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}

__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
    (__global const float * restrict src,
     __global float * dst,
     const int dst_cols,
     const int dst_rows,
     const int src_whole_cols,
     const int src_whole_rows,
     const int src_step_in_pixel,
     const int src_offset_x,
     const int src_offset_y,
     const int dst_step_in_pixel,
     const int radiusy,
     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float sum;
    float temp[READ_TIMES_ROW];

    __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i] = src[index[i]];
    }
#endif

    //save pixels to lds
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //read pixels from lds and calculate the result
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1; i<=RADIUSX; i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    //write the result to dst
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}

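ELEM, used throughout the BORDER_CONSTANT paths, is defined outside the hunks shown in this patch. Judging from the call sites it is a range-checked select; a hypothetical C reconstruction, for reading purposes only:

    /* Hypothetical reconstruction of ELEM (its real definition is not part
       of this diff): yield other_val when i lies outside [l_edge, r_edge),
       otherwise keep elem. */
    #define ELEM(i, l_edge, r_edge, other_val, elem) \
        ((i) < (l_edge) || (i) >= (r_edge) ? (other_val) : (elem))
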
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
    (__global const float4 * restrict src,
     __global float4 * dst,
     const int dst_cols,
     const int dst_rows,
     const int src_whole_cols,
     const int src_whole_rows,
     const int src_step_in_pixel,
     const int src_offset_x,
     const int src_offset_y,
     const int dst_step_in_pixel,
     const int radiusy,
     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int start_x = x+src_offset_x-RADIUSX;
    int start_y = y+src_offset_y-radiusy;
    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
    int i;
    float4 sum;
    float4 temp[READ_TIMES_ROW];

    __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        int current_addr = start_addr+i*LSIZE0;
        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
        temp[i] = src[current_addr];
    }
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
        temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
    }
#else
    int index[READ_TIMES_ROW];
    int s_x,s_y;
    //judge if read out of boundary
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
        s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
        s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
        s_y= ADDR_R(start_y,src_whole_rows,s_y);
        index[i]=mad24(s_y,src_step_in_pixel,s_x);
    }
    //read pixels from src
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        temp[i] = src[index[i]];
    }
#endif

    //save pixels to lds
    for(i = 0; i<READ_TIMES_ROW; i++)
    {
        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //read pixels from lds and calculate the result
    sum = LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
    for(i=1; i<=RADIUSX; i++)
    {
        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
    }
    //write the result to dst
    if((x<dst_cols) & (y<dst_rows))
    {
        start_addr = mad24(y,dst_step_in_pixel,x);
        dst[start_addr] = sum;
    }
}

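Taken together with col_filter earlier in this patch, the row_filter_* kernels implement separable 2-D filtering in two 1-D passes, the row pass producing a float intermediate image:

    (K * \mathrm{src})(x,y) = \sum_{j=-R_y}^{R_y} k_{\mathrm{col}}[R_y+j] \sum_{i=-R_x}^{R_x} k_{\mathrm{row}}[R_x+i]\,\mathrm{src}(x+i,\,y+j)

which equals the full 2-D filter whenever the kernel factors as K(x, y) = k_col[y] k_row[x], as it does for the Sobel, Scharr and Gaussian filters named in the comment above.
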
@@ -50,8 +50,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 : (addr))
#endif

#ifdef BORDER_REFLECT
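The replicate behaviour of these macros is easy to check in isolation; a small self-contained C program exercising them on a 1-D axis of width 8:

    #include <stdio.h>

    /* Same definitions as above: ADDR_L clamps indices below the left edge,
       ADDR_R clamps indices past the right edge. */
    #define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
    #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))

    int main(void)
    {
        int w = 8;
        for (int i = -2; i <= 9; i++)
        {
            int c = ADDR_L(i, 0, w);
            c = ADDR_R(i, w, c);
            printf("%d -> %d\n", i, c);   /* aaaaaa|abcdefgh|hhhhhhh */
        }
        return 0;
    }
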
@@ -103,12 +103,12 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;

    uint4 data[ksY+1];
    __local uint4 temp[(THREADS<<1)];

#ifdef BORDER_CONSTANT

    for(int i=0; i < ksY+1; i++)
    {
        if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
@@ -126,15 +126,15 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
            if(con) data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
        }
    }

#else
    int not_all_in_range;
    for(int i=0; i < ksY+1; i++)
    {
        not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
                           | (startY+i<0) | (startY+i>src_whole_rows-1);
        if(not_all_in_range)
        {
            int selected_row;
            int4 selected_col;
            selected_row = ADDR_H(startY+i, 0, src_whole_rows);
@@ -142,13 +142,13 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha

            selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
            selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);

            selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
            selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);

            selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
            selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);

            selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
            selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);

@@ -174,7 +174,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
    temp[col] = sum1;
    temp[col+THREADS] = sum2;
    barrier(CLK_LOCAL_MEM_FENCE);

    if(col >= anX && col < (THREADS-ksX+anX+1))
    {
        int posX = dst_startX - dst_x_off + (col-anX)*4;
@@ -189,7 +189,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
        {
            tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
        }

        if(posY < dst_rows && posX < dst_cols)
        {
            if(posX >= 0 && posX < dst_cols)
@@ -200,7 +200,7 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
                *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
            if(posX+3 >= 0 && posX+3 < dst_cols)
                *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
        }
    }
    if(posY+1 < dst_rows && posX < dst_cols)
    {
        dst_startY+=1;
@@ -212,9 +212,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
                *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
            if(posX+3 >= 0 && posX+3 < dst_cols)
                *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////
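The stores above divide an integer window sum by alpha, i.e. the kernel computes

    \mathrm{dst}(x,y) = \frac{1}{\alpha} \sum_{j=0}^{ksY-1} \sum_{i=0}^{ksX-1} \mathrm{src}(x - \mathrm{anX} + i,\; y - \mathrm{anY} + j)

For a normalized box filter the host would pass alpha = ksX * ksY; that is an assumption here, since alpha simply arrives as a kernel argument.
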
@@ -237,12 +237,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;
    //int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;

    int end_addr = src_whole_cols-4;
    uint4 data[ksY+1];
    __local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
    bool con;
    uint4 ss;
@@ -250,12 +250,12 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;

        //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
        //ss = convert_uint4(src[cur_addr]);

        int cur_col = clamp(startX + col, 0, src_whole_cols);
        if(con)
            ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);

        data[i] = con ? ss : 0;
    }
@@ -269,11 +269,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
    }

#endif
    uint4 sum0 = 0, sum1 = 0, sum2 = 0;
    for(int i=1; i < ksY; i++)
@@ -290,7 +290,7 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
    col += anX;
    int posX = dst_startX - dst_x_off + col - anX;
    int posY = (gY << 1);

    uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
    for(int k=0; k<2; k++)
        for(int i=-anX; i<=anX; i++)
@@ -298,11 +298,11 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
        {
            tmp_sum[k] += temp[k][col+i];
        }
    for(int i=0; i<2; i++)
    {
        if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
            dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
    }

    }
}
@@ -326,21 +326,21 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;
    int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
    float data[ksY+1];
    __local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
    bool con;
    float ss;
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
        //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
        //ss = src[cur_addr];

        int cur_col = clamp(startX + col, 0, src_whole_cols);
        //ss = src[(startY+i)*(src_step>>2) + cur_col];
        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;

        data[i] = con ? ss : 0.f;
@@ -355,10 +355,10 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = src[selected_row * (src_step>>2) + selected_col];
    }

#endif
    float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
    for(int i=1; i < ksY; i++)
@@ -375,7 +375,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
    col += anX;
    int posX = dst_startX - dst_x_off + col - anX;
    int posY = (gY << 1);

    float tmp_sum[2]={0.0, 0.0};
    for(int k=0; k<2; k++)
        for(int i=-anX; i<=anX; i++)
@@ -383,11 +383,11 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
        {
            tmp_sum[k] += temp[k][col+i];
        }
    for(int i=0; i<2; i++)
    {
        if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
            dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
    }

    }
}
@@ -411,21 +411,21 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;
    int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
    float4 data[ksY+1];
    __local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
    bool con;
    float4 ss;
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
        //int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
        //ss = src[cur_addr];

        int cur_col = clamp(startX + col, 0, src_whole_cols);
        //ss = src[(startY+i)*(src_step>>4) + cur_col];
        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;

        data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
@@ -440,10 +440,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = src[selected_row * (src_step>>4) + selected_col];
    }

#endif
    float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
    for(int i=1; i < ksY; i++)
@@ -460,7 +460,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
    col += anX;
    int posX = dst_startX - dst_x_off + col - anX;
    int posY = (gY << 1);

    float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
    for(int k=0; k<2; k++)
        for(int i=-anX; i<=anX; i++)
@@ -468,10 +468,10 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
        {
            tmp_sum[k] += temp[k][col+i];
        }
    for(int i=0; i<2; i++)
    {
        if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
            dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
    }

    }
}
@@ -51,8 +51,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 : (addr))
#endif

#ifdef BORDER_REFLECT
@@ -90,10 +90,10 @@
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY)   //(ROWS_PER_GROUP + anY * 2)

#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6

#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2

#define LOCAL_MEM_STEP 260   //divup((get_local_size(0) + anX * 2), 4) * 4
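The LOCAL_MEM_STEP comment can be sanity-checked: with divup(a, b) = (a + b - 1) / b and a work-group width of 256 (an assumption; get_local_size(0) is only known at run time), divup(256 + anX*2, 4) * 4 gives 260 for any anchor 1 <= anX <= 2:

    #include <stdio.h>

    /* Verify the hard-coded LOCAL_MEM_STEP of 260 against its own comment. */
    #define divup(a, b) (((a) + (b) - 1) / (b))

    int main(void)
    {
        int local_size = 256;                 /* assumed get_local_size(0) */
        for (int anX = 1; anX <= 2; anX++)
            printf("anX=%d -> %d\n", anX, divup(local_size + anX * 2, 4) * 4);
        return 0;
    }
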
@@ -101,10 +101,10 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
                             __global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
                             __constant int *mat_kernel __attribute__((max_constant_size (16384))),
                             int cols, int rows, int operate_cols, int wholecols, int wholerows)
{
    int gX = get_global_id(0);
    int gY = get_global_id(1);
@@ -114,16 +114,16 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
    int groupX_size = get_local_size(0);
    int groupX_id   = get_group_id(0);

#define dst_align (dst_offset_x & 3)
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    __local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
    if((gY << 2) < rows)
    {
        for(int i = 0; i < ROWS_FETCH; ++i)
        {
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
@@ -132,7 +132,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
                uchar data = *(src + selected_row * src_step + selected_cols);
                int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
                data = con ? data : 0;
                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -141,7 +141,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
                    data = *(src + selected_row * src_step + selected_cols);
                    con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
                    data = con ? data : 0;
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#else
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -152,7 +152,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x

                uchar data = *(src + selected_row * src_step + selected_cols);

                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -160,7 +160,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = *(src + selected_row * src_step + selected_cols);
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#endif
            }
@@ -171,9 +171,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
|
||||
if(((gY << 2) < rows) && (process_col < operate_cols))
|
||||
{
|
||||
int dst_cols_start = dst_offset_x;
|
||||
int dst_cols_start = dst_offset_x;
|
||||
int dst_cols_end = dst_offset_x + cols;
|
||||
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
|
||||
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
|
||||
|
||||
int dst_rows_end = dst_offset_y + rows;
|
||||
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
|
||||
@@ -191,9 +191,9 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
|
||||
}
|
||||
}
|
||||
@@ -205,17 +205,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
|
||||
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
|
||||
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
|
||||
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
|
||||
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
|
||||
}
|
||||
}
|
||||
}
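// Summary of filter2D_C1_D0: the group stages a padded 8-bit tile in local
// memory, then each thread convolves four adjacent pixels at once
// (vload4 + convert_int4_sat) against the ANCHOR x ANCHOR __constant
// kernel. The per-component dst_cols_index range checks keep the aligned
// uchar4 store from writing outside the requested ROI; rejected lanes are
// refilled from dst_data.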
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
                             __global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
                             __constant int *mat_kernel __attribute__((max_constant_size (16384))),
                             int cols, int rows, int operate_cols, int wholecols, int wholerows)
{
    int gX = get_global_id(0);
    int gY = get_global_id(1);
@@ -225,16 +225,16 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
    int groupX_size = get_local_size(0);
    int groupX_id   = get_group_id(0);

#define dst_align (dst_offset_x & 3)
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index       = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    __local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
    if((gY << 2) < rows)
    {
        for(int i = 0; i < ROWS_FETCH; ++i)
        {
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
@@ -243,7 +243,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
                float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                data = con ? data : 0;
                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -252,7 +252,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
                    data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                    data = con ? data : 0;
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#else
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -262,7 +262,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

                float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -270,7 +270,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#endif
            }
@@ -281,9 +281,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
    if(((gY << 2) < rows) && (process_col < operate_cols))
    {
        int dst_cols_start = dst_offset_x;
        int dst_cols_end   = dst_offset_x + cols;
        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;

        int dst_rows_end   = dst_offset_y + rows;
        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -301,9 +301,9 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
                if(dst_rows_index < dst_rows_end)
                {
                    int local_row  = (lX >> THREADS_PER_ROW_BIT) + i;
                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

                    data = vload4(0, local_data + local_row * LOCAL_MEM_STEP + local_cols);
                    sum = sum + (mat_kernel[i * ANCHOR + j] * data);
                }
            }
@@ -316,7 +316,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
        sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
        sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;

        *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
        }
    }
}
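// Summary of filter2D_C1_D5: same tiling as the 8uC1 path, but src_step and
// dst_step are byte pitches, so column indices are shifted left by 2
// (sizeof(float)) before the pointer arithmetic and the accumulation stays
// in float4 with no saturating conversion.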
@@ -324,10 +324,10 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
                             __global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
                             __constant int *mat_kernel __attribute__((max_constant_size (16384))),
                             int cols, int rows, int operate_cols, int wholecols, int wholerows)
{
    int gX = get_global_id(0);
    int gY = get_global_id(1);
@@ -337,17 +337,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
    int groupX_size = get_local_size(0);
    int groupX_id   = get_group_id(0);

#define dst_align (dst_offset_x & 3)
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index       = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    __local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];

    if((gY << 2) < rows)
    {
        for(int i = 0; i < ROWS_FETCH; ++i)
        {
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
@@ -356,7 +356,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
                uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                data = con ? data : 0;
                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -365,7 +365,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
                    data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                    data = con ? data : 0;
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#else
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -376,7 +376,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
                uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));

                local_data[i * LOCAL_MEM_STEP + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -384,7 +384,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
                    local_data[i * LOCAL_MEM_STEP + lX + groupX_size] = data;
                }
#endif
            }
@@ -395,9 +395,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
    if(((gY << 2) < rows) && (process_col < operate_cols))
    {
        int dst_cols_start = dst_offset_x;
        int dst_cols_end   = dst_offset_x + cols;
        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;

        int dst_rows_end   = dst_offset_y + rows;
        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
@@ -416,9 +416,9 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
                if(dst_rows_index < dst_rows_end)
                {
                    int local_row  = (lX >> THREADS_PER_ROW_BIT) + i;
                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

                    data = vload16(0, (__local uchar *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
                    sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
                }
            }
@@ -427,16 +427,16 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
        if(dst_rows_index < dst_rows_end)
        {
            uchar16 sum1 = convert_uchar16_sat(sum);
            sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
                         sum1.s0123 : dst_data.s0123;
            sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
                         sum1.s4567 : dst_data.s4567;
            sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
                         sum1.s89ab : dst_data.s89ab;
            sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
                         sum1.scdef : dst_data.scdef;

            *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
        }
    }
}
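// Summary of filter2D_C4_D0: four uchar4 pixels are processed per thread as
// one uchar16, so the ROI guard selects whole 4-byte lanes (s0123..scdef)
// rather than single components; one pixel is 4 bytes here, hence the
// (selected_cols << 2) byte offsets.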
@@ -445,10 +445,10 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY)   //(ROWS_PER_GROUP + anY * 2)
#define LOCAL_MEM_STEP_C4 260           //divup((get_local_size(0) + anX * 2), 4) * 4
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
                             __global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
                             __constant int *mat_kernel __attribute__((max_constant_size (16384))),
                             int cols, int rows, int operate_cols, int wholecols, int wholerows)
{
    int gX = get_global_id(0);
    int gY = get_global_id(1);
@@ -458,15 +458,15 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
    int groupX_size = get_local_size(0);
    int groupX_id   = get_group_id(0);

    int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
    int rows_start_index       = src_offset_y + gY - ANY;

    __local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
    if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
    {
        for(int i = 0; i < ROWS_FETCH_C4; ++i)
        {
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
@@ -475,7 +475,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
                float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                data = con ? data : 0;
                local_data[i * LOCAL_MEM_STEP_C4 + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -484,7 +484,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
                    data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                    data = con ? data : 0;
                    local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] = data;
                }
#else
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -494,7 +494,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

                float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
                local_data[i * LOCAL_MEM_STEP_C4 + lX] = data;

                if(lX < (ANX << 1))
                {
@@ -502,7 +502,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
                    local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] = data;
                }
#endif
            }
@@ -512,7 +512,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_

    if((gY < rows) && (gX < operate_cols))
    {
        int dst_cols_index = dst_offset_x + gX;
        int dst_rows_index = dst_offset_y + gY;

        float4 sum = (float4)(0);
@@ -521,11 +521,11 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_
        {
            for(int j = 0; j < ANCHOR; j++)
            {
                int local_cols = lX + j;
                sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
            }
        }

        *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
    }
}
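// Summary of filter2D_C4_D5: one float4 pixel per thread (16 bytes, hence
// the << 4 offsets), and only ROWS_FETCH_C4 = 1 + 2*ANY apron rows are
// fetched because each group emits a single output row instead of four.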

@@ -45,160 +45,160 @@
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#ifndef GENTYPE
__kernel void morph_C1_D0(__global const uchar * restrict src,
                          __global uchar *dst,
                          int src_offset_x, int src_offset_y,
                          int cols, int rows,
                          int src_step_in_pixel, int dst_step_in_pixel,
                          __constant uchar * mat_kernel,
                          int src_whole_cols, int src_whole_rows,
                          int dst_offset_in_pixel)
{
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int x = get_group_id(0)*4*LSIZE0;
    int y = get_group_id(1)*LSIZE1;
    int start_x = (x + src_offset_x - RADIUSX) & 0xfffffffc;
    int end_x   = (x + src_offset_x + LSIZE0*4 + RADIUSX) & 0xfffffffc;
    int width = (end_x - start_x + 4) >> 2;
    int offset = (src_offset_x - RADIUSX) & 3;
    int start_y = y + src_offset_y - RADIUSY;
    int point1 = mad24(l_y,LSIZE0,l_x);
    int point2 = point1 + LSIZE0*LSIZE1;
    int tl_x  = (point1 % width)<<2;
    int tl_y  = point1 / width;
    int tl_x2 = (point2 % width)<<2;
    int tl_y2 = point2 / width;
    int cur_x  = start_x + tl_x;
    int cur_y  = start_y + tl_y;
    int cur_x2 = start_x + tl_x2;
    int cur_y2 = start_y + tl_y2;
    int start_addr  = mad24(cur_y, src_step_in_pixel, cur_x);
    int start_addr2 = mad24(cur_y2,src_step_in_pixel, cur_x2);
    uchar4 temp0,temp1;
    __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];

    int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
    //read pixels from src
    start_addr  = ((start_addr  < end_addr) && (start_addr  > 0)) ? start_addr  : 0;
    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
    temp0 = *(__global uchar4*)&src[start_addr];
    temp1 = *(__global uchar4*)&src[start_addr2];
    //replace out-of-boundary reads with VAL
    temp0.x = ELEM(cur_x,   0, src_whole_cols, VAL, temp0.x);
    temp0.y = ELEM(cur_x+1, 0, src_whole_cols, VAL, temp0.y);
    temp0.z = ELEM(cur_x+2, 0, src_whole_cols, VAL, temp0.z);
    temp0.w = ELEM(cur_x+3, 0, src_whole_cols, VAL, temp0.w);
    temp0   = ELEM(cur_y,   0, src_whole_rows, (uchar4)VAL, temp0);

    temp1.x = ELEM(cur_x2,   0, src_whole_cols, VAL, temp1.x);
    temp1.y = ELEM(cur_x2+1, 0, src_whole_cols, VAL, temp1.y);
    temp1.z = ELEM(cur_x2+2, 0, src_whole_cols, VAL, temp1.z);
    temp1.w = ELEM(cur_x2+3, 0, src_whole_cols, VAL, temp1.w);
    temp1   = ELEM(cur_y2,   0, src_whole_rows, (uchar4)VAL, temp1);

    LDS_DAT[point1] = temp0;
    LDS_DAT[point2] = temp1;
    barrier(CLK_LOCAL_MEM_FENCE);
    uchar4 res = (uchar4)VAL;
    for(int i=0; i<2*RADIUSY+1; i++)
        for(int j=0; j<2*RADIUSX+1; j++)
        {
            res = mat_kernel[i*(2*RADIUSX+1)+j] ? MORPH_OP(res, vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)) : res;
        }
    int gidx = get_global_id(0)<<2;
    int gidy = get_global_id(1);
    int out_addr = mad24(gidy, dst_step_in_pixel, gidx+dst_offset_in_pixel);
    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel & 3) == 0))
    {
        *(__global uchar4*)&dst[out_addr] = res;
    }
    else
    {
        if(gidx+3<cols && gidy<rows)
        {
            dst[out_addr]   = res.x;
            dst[out_addr+1] = res.y;
            dst[out_addr+2] = res.z;
            dst[out_addr+3] = res.w;
        }
        else if(gidx+2<cols && gidy<rows)
        {
            dst[out_addr]   = res.x;
            dst[out_addr+1] = res.y;
            dst[out_addr+2] = res.z;
        }
        else if(gidx+1<cols && gidy<rows)
        {
            dst[out_addr]   = res.x;
            dst[out_addr+1] = res.y;
        }
        else if(gidx<cols && gidy<rows)
        {
            dst[out_addr] = res.x;
        }
    }
}
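// Summary of morph_C1_D0: an erode/dilate pass over 8uC1 data. MORPH_OP and
// VAL arrive as build-time defines (min with VAL = 255 for erode and max
// with VAL = 0 for dilate would be the natural pairing, though the host
// code is not shown here); the mat_kernel byte mask selects which
// neighbours participate, and vload4 at a byte offset lets each thread
// reduce a 4-pixel window from the local tile.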
#else
__kernel void morph(__global const GENTYPE * restrict src,
                    __global GENTYPE *dst,
                    int src_offset_x, int src_offset_y,
                    int cols, int rows,
                    int src_step_in_pixel, int dst_step_in_pixel,
                    __constant uchar * mat_kernel,
                    int src_whole_cols, int src_whole_rows,
                    int dst_offset_in_pixel)
{
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int x = get_group_id(0)*LSIZE0;
    int y = get_group_id(1)*LSIZE1;
    int start_x = x + src_offset_x - RADIUSX;
    int end_x   = x + src_offset_x + LSIZE0 + RADIUSX;
    int width = end_x - start_x + 1;
    int start_y = y + src_offset_y - RADIUSY;
    int point1 = mad24(l_y,LSIZE0,l_x);
    int point2 = point1 + LSIZE0*LSIZE1;
    int tl_x  = point1 % width;
    int tl_y  = point1 / width;
    int tl_x2 = point2 % width;
    int tl_y2 = point2 / width;
    int cur_x  = start_x + tl_x;
    int cur_y  = start_y + tl_y;
    int cur_x2 = start_x + tl_x2;
    int cur_y2 = start_y + tl_y2;
    int start_addr  = mad24(cur_y, src_step_in_pixel, cur_x);
    int start_addr2 = mad24(cur_y2,src_step_in_pixel, cur_x2);
    GENTYPE temp0,temp1;
    __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];

    int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
    //read pixels from src
    start_addr  = ((start_addr  < end_addr) && (start_addr  > 0)) ? start_addr  : 0;
    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
    temp0 = src[start_addr];
    temp1 = src[start_addr2];
    //replace out-of-boundary reads with VAL
    temp0 = ELEM(cur_x, 0, src_whole_cols, (GENTYPE)VAL, temp0);
    temp0 = ELEM(cur_y, 0, src_whole_rows, (GENTYPE)VAL, temp0);

    temp1 = ELEM(cur_x2, 0, src_whole_cols, (GENTYPE)VAL, temp1);
    temp1 = ELEM(cur_y2, 0, src_whole_rows, (GENTYPE)VAL, temp1);

    LDS_DAT[point1] = temp0;
    LDS_DAT[point2] = temp1;
    barrier(CLK_LOCAL_MEM_FENCE);
    GENTYPE res = (GENTYPE)VAL;
    for(int i=0; i<2*RADIUSY+1; i++)
        for(int j=0; j<2*RADIUSX+1; j++)
        {
            res = mat_kernel[i*(2*RADIUSX+1)+j] ? MORPH_OP(res, LDS_DAT[mad24(l_y+i,width,l_x+j)]) : res;
        }
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    int out_addr = mad24(gidy, dst_step_in_pixel, gidx+dst_offset_in_pixel);
    if(gidx<cols && gidy<rows)
    {
        dst[out_addr] = res;
    }
}
#endif
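// Note: the GENTYPE branch is the scalar-per-thread generic fallback.
// Presumably the host compiles this file with -D GENTYPE=<pixel type>
// plus -D MORPH_OP and -D VAL (an assumption consistent with the uchar
// specialisation above); that instantiates one kernel per element type
// without duplicating the source.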

@@ -46,365 +46,365 @@ typedef float sqsumtype;

typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
    struct __attribute__((aligned (32)))
    {
        int p0 __attribute__((aligned (4)));
        int p1 __attribute__((aligned (4)));
        int p2 __attribute__((aligned (4)));
        int p3 __attribute__((aligned (4)));
        float weight __attribute__((aligned (4)));
    }
    rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;


typedef struct __attribute__((aligned (128))) GpuHidHaarTreeNode
{
    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
    float threshold /*__attribute__((aligned (4)))*/;
    float alpha[2] __attribute__((aligned (8)));
    int left __attribute__((aligned (4)));
    int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;


typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
    int count __attribute__((aligned (4)));
    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
    float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;


typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
    int count __attribute__((aligned (4)));
    float threshold __attribute__((aligned (4)));
    int two_rects __attribute__((aligned (4)));
    int reserved0 __attribute__((aligned (8)));
    int reserved1 __attribute__((aligned (8)));
    int reserved2 __attribute__((aligned (8)));
    int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;


typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
    int count __attribute__((aligned (4)));
    int is_stump_based __attribute__((aligned (4)));
    int has_tilted_features __attribute__((aligned (4)));
    int is_tree __attribute__((aligned (4)));
    int pq0 __attribute__((aligned (4)));
    int pq1 __attribute__((aligned (4)));
    int pq2 __attribute__((aligned (4)));
    int pq3 __attribute__((aligned (4)));
    int p0 __attribute__((aligned (4)));
    int p1 __attribute__((aligned (4)));
    int p2 __attribute__((aligned (4)));
    int p3 __attribute__((aligned (4)));
    float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
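// Note on the structs above: the alignment attributes mirror the host-side
// layout so the kernel below can read a node's three rectangles as int4
// loads from p[0..2][0..3] and its weights as a single float4; those vector
// loads rely on GpuHidHaarTreeNode being padded out to its 128-byte
// alignment.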

__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
    global GpuHidHaarStageClassifier * stagecascadeptr,
    global int4 * info,
    global GpuHidHaarTreeNode * nodeptr,
    global const int * restrict sum1,
    global const float * restrict sqsum1,
    global int4 * candidate,
    const int pixelstep,
    const int loopcount,
    const int start_stage,
    const int split_stage,
    const int end_stage,
    const int startnode,
    const int splitnode,
    const int4 p,
    const int4 pq,
    const float correction
    //const int width,
    //const int height,
    //const int grpnumperline,
    //const int totalgrp
    )
{
    int grpszx = get_local_size(0);
    int grpszy = get_local_size(1);
    int grpnumx = get_num_groups(0);
    int grpidx = get_group_id(0);
    int lclidx = get_local_id(0);
    int lclidy = get_local_id(1);

    int lcl_sz = mul24(grpszx,grpszy);
    int lcl_id = mad24(lclidy,grpszx,lclidx);

    //assume lcl_sz == 256 or 128 or 64
    //int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
    //lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
    __local int lclshare[1024];

#define OFF 0
    __local int* lcldata = lclshare + OFF;//staged window data
    __local int* glboutindex = lcldata + 28*28;//global output index
    __local int* lclcount = glboutindex + 1;//number of temporarily passing pixels
    __local int* lcloutindex = lclcount + 1;//info of temporarily passing pixels
    __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
    glboutindex[0] = 0;
    int outputoff = mul24(grpidx,256);

    //assume window size is 20X20
#define WINDOWSIZE (20+1)
    //make sure readwidth is a multiple of 4
    //ystep = 1, from host code
    int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
    int readheight = grpszy-1+WINDOWSIZE;
    int read_horiz_cnt = readwidth >> 2;//each read is one int4
    int total_read = mul24(read_horiz_cnt,readheight);
    int read_loop = (total_read + lcl_sz - 1) >> 6;
    candidate[outputoff+(lcl_id<<2)]   = (int4)0;
    candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
    candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
    candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
    for(int scalei = 0; scalei < loopcount; scalei++)
    {
        int4 scaleinfo1 = info[scalei];
        int width  = (scaleinfo1.x & 0xffff0000) >> 16;
        int height = scaleinfo1.x & 0xffff;
        int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
        int totalgrp = scaleinfo1.y & 0xffff;
        int imgoff = scaleinfo1.z;
        float factor = as_float(scaleinfo1.w);
        //int ystep = 1;// factor > 2.0 ? 1 : 2;

        __global const int * sum = sum1 + imgoff;
        __global const float * sqsum = sqsum1 + imgoff;
        for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
        {
            int grpidy = grploop / grpnumperline;
            int grpidx = grploop - mul24(grpidy, grpnumperline);
            int x = mad24(grpidx,grpszx,lclidx);
            int y = mad24(grpidy,grpszy,lclidy);
            //candidate_result.x = convert_int_rtn(x*factor);
            //candidate_result.y = convert_int_rtn(y*factor);
            int grpoffx = x-lclidx;
            int grpoffy = y-lclidy;

            for(int i=0; i<read_loop; i++)
            {
                int pos_id = mad24(i,lcl_sz,lcl_id);
                pos_id = pos_id < total_read ? pos_id : 0;

                int lcl_y = pos_id / read_horiz_cnt;
                int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);

                int glb_x = grpoffx + (lcl_x<<2);
                int glb_y = grpoffy + lcl_y;

                int glb_off = mad24(glb_y,pixelstep,glb_x);
                int4 data = *(__global int4*)&sum[glb_off];
                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);

                lcldata[lcl_off]   = data.x;
                lcldata[lcl_off+1] = data.y;
                lcldata[lcl_off+2] = data.z;
                lcldata[lcl_off+3] = data.w;
            }

            lcloutindex[lcl_id] = 0;
            lclcount[0] = 0;
            int result = 1;
            int nodecounter = startnode;
            float mean, variance_norm_factor;
            barrier(CLK_LOCAL_MEM_FENCE);

            int lcl_off = mad24(lclidy,readwidth,lclidx);
            int4 cascadeinfo1, cascadeinfo2;
            cascadeinfo1 = p;
            cascadeinfo2 = pq;// + mad24(y, pixelstep, x);

            //if((x < width) && (y < height))
            {
                cascadeinfo1.x += lcl_off;
                cascadeinfo1.z += lcl_off;
                mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
                        lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
                       *correction;

                int p_offset = mad24(y, pixelstep, x);

                cascadeinfo2.x += p_offset;
                cascadeinfo2.z += p_offset;
                variance_norm_factor = sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
                                       sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];

                variance_norm_factor = variance_norm_factor * correction - mean * mean;
                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
                //if( cascade->is_stump_based )
                //{
                for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
                {
                    float stage_sum = 0.f;
                    int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
                    float stagethreshold = as_float(stageinfo.y);
                    for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
                    {
                        __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);

                        int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                        int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                        int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                        float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
                        float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
                        float nodethreshold = w.w * variance_norm_factor;

                        info1.x += lcl_off;
                        info1.z += lcl_off;
                        info2.x += lcl_off;
                        info2.z += lcl_off;

                        float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
                                          lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;

                        classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
                                     lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;

                        //if((info3.z - info3.x) && (!stageinfo.z))
                        //{
                        info3.x += lcl_off;
                        info3.z += lcl_off;
                        classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                     lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
                        //}
                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
                        nodecounter++;
                    }

                    result = (stage_sum >= stagethreshold);
                }

                if(result && (x < width) && (y < height))
                {
                    int queueindex = atomic_inc(lclcount);
                    lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
                    lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                int queuecount = lclcount[0];
                nodecounter = splitnode;
                for(int stageloop = split_stage; stageloop < end_stage && queuecount > 0; stageloop++)
                {
                    //barrier(CLK_LOCAL_MEM_FENCE);
                    //if(lcl_id == 0)
                    lclcount[0] = 0;
                    barrier(CLK_LOCAL_MEM_FENCE);

                    int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
                    float stagethreshold = as_float(stageinfo.y);

                    int perfscale = queuecount > 4 ? 3 : 2;
                    int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
                    int lcl_compute_win = lcl_sz >> perfscale;
                    int lcl_compute_win_id = (lcl_id >> (6-perfscale));
                    int lcl_loops = (stageinfo.x + lcl_compute_win - 1) >> (6-perfscale);
                    int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
                    for(int queueloop=0; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
                    {
                        float stage_sum = 0.f;
                        int temp_coord = lcloutindex[lcl_compute_win_id<<1];
                        float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
                        int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);

                        //barrier(CLK_LOCAL_MEM_FENCE);
                        if(lcl_compute_win_id < queuecount) {

                            int tempnodecounter = lcl_compute_id;
                            float part_sum = 0.f;
                            for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
                            {
                                __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);

                                int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                                int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                                int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                                float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
                                float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
                                float nodethreshold = w.w * variance_norm_factor;

                                info1.x += queue_pixel;
                                info1.z += queue_pixel;
                                info2.x += queue_pixel;
                                info2.z += queue_pixel;

                                float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
                                                  lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;

                                classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
                                             lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
                                //if((info3.z - info3.x) && (!stageinfo.z))
                                //{
                                info3.x += queue_pixel;
                                info3.z += queue_pixel;
                                classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                             lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
                                //}
                                part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
                                tempnodecounter += lcl_compute_win;
                            }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
                            partialsum[lcl_id] = part_sum;
                        }
                        barrier(CLK_LOCAL_MEM_FENCE);
                        if(lcl_compute_win_id < queuecount) {
                            for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
                            {
                                stage_sum += partialsum[lcl_id+i];
                            }
                            if(stage_sum >= stagethreshold && (lcl_compute_id==0))
                            {
                                int queueindex = atomic_inc(lclcount);
                                lcloutindex[queueindex<<1] = temp_coord;
                                lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
                            }
                            lcl_compute_win_id += (1<<perfscale);
                        }
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
                    barrier(CLK_LOCAL_MEM_FENCE);
                    queuecount = lclcount[0];
                    nodecounter += stageinfo.x;
                }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
                //barrier(CLK_LOCAL_MEM_FENCE);
                if(lcl_id < queuecount)
                {
                    int temp = lcloutindex[lcl_id<<1];
                    int x = mad24(grpidx,grpszx,temp & 0xffff);
                    int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
                    temp = glboutindex[0];
                    int4 candidate_result;
                    candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
                    candidate_result.x  = convert_int_rtn(x*factor);
                    candidate_result.y  = convert_int_rtn(y*factor);
                    atomic_inc(glboutindex);
                    candidate[outputoff+temp+lcl_id] = candidate_result;
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }//end if((x < width) && (y < height))
        }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
        //outputoff +=mul24(width,height);
    }//end for(int scalei = 0; scalei <loopcount; scalei++)
    barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
queuecount = lclcount[0];
|
||||
nodecounter += stageinfo.x;
|
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_id<queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id<<1];
|
||||
int x = mad24(grpidx,grpszx,temp & 0xffff);
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end if((x < width) && (y < height))
|
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
|
||||
//outputoff +=mul24(width,height);
|
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++)
|
||||
}
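For reference, the reduction above is the simplest serial form: one work-item per compute window walks the `lcl_compute_win` partial sums. A minimal standalone sketch of the same pattern (a hypothetical kernel, not part of this commit; `WG_SIZE` is an assumed compile-time work-group size):

// Hedged sketch: serial reduction of per-work-item partial sums in local
// memory, mirroring the stage-sum loop above. WG_SIZE is hypothetical;
// real code would pass it as a build option.
#define WG_SIZE 64
__kernel void reduce_partial_sums(__global const float *in, __global float *out)
{
    __local float partial[WG_SIZE];
    int lid = get_local_id(0);
    partial[lid] = in[get_global_id(0)];   // each work-item contributes one value
    barrier(CLK_LOCAL_MEM_FENCE);          // make all partial sums visible to the group
    if(lid == 0)
    {
        float sum = 0.f;
        for(int i = 0; i < WG_SIZE; i++)   // serial walk, like the lcl_compute_win loop
            sum += partial[i];
        out[get_group_id(0)] = sum;
    }
}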
@@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
/*
if(stagecascade->two_rects)
{
    #pragma unroll
    for( n = 0; n < stagecascade->count; n++ )
@@ -429,10 +429,10 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
        t1 = *(node + counter);
        t = t1.threshold * variance_norm_factor;
        classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];

        classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
        stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];

        counter++;
    }
}
@@ -444,75 +444,75 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
        t = node[counter].threshold*variance_norm_factor;
        classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
        classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];

        if( node[counter].p0[2] )
            classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];

        stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify

        counter++;
    }
}
*/
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
    constant GpuHidHaarClassifierCascade * _cascade,
    global GpuHidHaarStageClassifier * stagecascadeptr,
    //global GpuHidHaarClassifier * classifierptr,
    global GpuHidHaarTreeNode * nodeptr,
    global int * sum,
    global float * sqsum,
    global int * _candidate,
    int pixel_step,
    int cols,
    int rows,
    int start_stage,
    int end_stage,
    //int counts,
    int nodenum,
    int ystep,
    int detect_width,
    //int detect_height,
    int loopcount,
    int outputstep)
    //float scalefactor)
{
    unsigned int x1 = get_global_id(0);
    unsigned int y1 = get_global_id(1);
    int p_offset;
    int m, n;
    int result;
    int counter;
    float mean, variance_norm_factor;
    for(int i=0;i<loopcount;i++)
    {
        constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
        global int * candidate = _candidate + i*outputstep;
        int window_width = cascade->p1 - cascade->p0;
        int window_height = window_width;
        result = 1;
        counter = 0;
        unsigned int x = mul24(x1,ystep);
        unsigned int y = mul24(y1,ystep);
        if((x < cols - window_width - 1) && (y < rows - window_height -1))
        {
            global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
            //global GpuHidHaarClassifier *classifier = classifierptr;
            global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;

            p_offset = mad24(y, pixel_step, x);// modify

            mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
                    *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
                   *cascade->inv_window_area;

            variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
                                   *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
            variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
            variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify

            // if( cascade->is_stump_based )
            //{
            for( m = start_stage; m < end_stage; m++ )
            {
                float stage_sum = 0.f;
@@ -532,29 +532,29 @@ __kernel void gpuRunHaarClassifierCascade_ScaleWindow(
                stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
                counter++;
            }

            if (stage_sum < stagecascade->threshold)
            {
                result = 0;
                break;
            }

            stagecascade++;
        }
        if(result)
        {
            candidate[4 * (y1 * detect_width + x1)] = x;
            candidate[4 * (y1 * detect_width + x1) + 1] = y;
            candidate[4 * (y1 * detect_width + x1) + 2] = window_width;
            candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
        }
        //}
        }
    }
}
*/
@@ -50,89 +50,89 @@ typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
    struct __attribute__((aligned (32)))
    {
        int p0 __attribute__((aligned (4)));
        int p1 __attribute__((aligned (4)));
        int p2 __attribute__((aligned (4)));
        int p3 __attribute__((aligned (4)));
        float weight __attribute__((aligned (4)));
    }
    rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
    float threshold /*__attribute__((aligned (4)))*/;
    float alpha[2] __attribute__((aligned (8)));
    int left __attribute__((aligned (4)));
    int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
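The 64-byte alignment on `p` is what makes the vector loads in the kernels above and below legal: each `p[i]` row is reinterpreted as an `int4`, and `weight` as a `float4`. A minimal illustration (names from the struct above; the layout reading is inferred from how the kernels use `w.w`):

// Reading one tree node with vector loads; valid because the node is
// 128-byte aligned and p[][4] is 64-byte aligned.
//   __global GpuHidHaarTreeNode *node = ...;
//   int4   rect0 = *(__global int4*)(&(node->p[0][0]));     // one feature rectangle
//   float4 w     = *(__global float4*)(&(node->weight[0])); // weight[0..2] plus, in w.w,
//                                                           // the adjacent threshold field,
//                                                           // which is why the kernels form
//                                                           // nodethreshold from w.w.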
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
    int count __attribute__((aligned (4)));
    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
    float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
    int count __attribute__((aligned (4)));
    float threshold __attribute__((aligned (4)));
    int two_rects __attribute__((aligned (4)));
    int reserved0 __attribute__((aligned (8)));
    int reserved1 __attribute__((aligned (8)));
    int reserved2 __attribute__((aligned (8)));
    int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
    int count __attribute__((aligned (4)));
    int is_stump_based __attribute__((aligned (4)));
    int has_tilted_features __attribute__((aligned (4)));
    int is_tree __attribute__((aligned (4)));
    int pq0 __attribute__((aligned (4)));
    int pq1 __attribute__((aligned (4)));
    int pq2 __attribute__((aligned (4)));
    int pq3 __attribute__((aligned (4)));
    int p0 __attribute__((aligned (4)));
    int p1 __attribute__((aligned (4)));
    int p2 __attribute__((aligned (4)));
    int p3 __attribute__((aligned (4)));
    float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;

__kernel void gpuRunHaarClassifierCascade_scaled2(
    global GpuHidHaarStageClassifier * stagecascadeptr,
    global int4 * info,
    global GpuHidHaarTreeNode * nodeptr,
    global const int * restrict sum,
    global const float * restrict sqsum,
    global int4 * candidate,
    const int step,
    const int loopcount,
    const int start_stage,
    const int split_stage,
    const int end_stage,
    const int startnode,
    const int splitnode,
    global int4 * p,
    //const int4 * pq,
    global float * correction,
    const int nodecount)
{
    int grpszx = get_local_size(0);
    int grpszy = get_local_size(1);
    int grpnumx = get_num_groups(0);
    int grpidx=get_group_id(0);
    int lclidx = get_local_id(0);
    int lclidy = get_local_id(1);
    int lcl_sz = mul24(grpszx,grpszy);
    int lcl_id = mad24(lclidy,grpszx,lclidx);
    __local int lclshare[1024];
    __local int* glboutindex=lclshare+0;
    __local int* lclcount=glboutindex+1;
@@ -140,85 +140,85 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
    __local float* partialsum=(__local float*)(lcloutindex+(lcl_sz<<1));
    glboutindex[0]=0;
    int outputoff = mul24(grpidx,256);
    candidate[outputoff+(lcl_id<<2)] = (int4)0;
    candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
    candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
    candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
    for(int scalei = 0; scalei <loopcount; scalei++)
    {
        int4 scaleinfo1;
        scaleinfo1 = info[scalei];
        int width = (scaleinfo1.x & 0xffff0000) >> 16;
        int height = scaleinfo1.x & 0xffff;
        int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
        int totalgrp = scaleinfo1.y & 0xffff;
        float factor = as_float(scaleinfo1.w);
        float correction_t=correction[scalei];
        int ystep=(int)(max(2.0f,factor)+0.5f);
        for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx)
        {
            int4 cascadeinfo=p[scalei];
            int grpidy = grploop / grpnumperline;
            int grpidx = grploop - mul24(grpidy, grpnumperline);
            int ix = mad24(grpidx,grpszx,lclidx);
            int iy = mad24(grpidy,grpszy,lclidy);
            int x=ix*ystep;
            int y=iy*ystep;
            lcloutindex[lcl_id]=0;
            lclcount[0]=0;
            int result=1,nodecounter;
            float mean,variance_norm_factor;
            //if((ix < width) && (iy < height))
            {
                const int p_offset = mad24(y, step, x);
                cascadeinfo.x +=p_offset;
                cascadeinfo.z +=p_offset;
                mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
                        sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
                       *correction_t;
                variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
                        sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
                variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
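                // In integral-image terms the two expressions above are the standard
                // window statistics. With S = window sum (from `sum`), S2 = window sum
                // of squares (from `sqsum`) and correction_t = 1/area:
                //   mean = S / area
                //   var  = S2 / area - mean^2
                //   variance_norm_factor = var >= 0 ? sqrt(var) : 1
                // i.e. the per-window standard deviation used to scale node thresholds.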
                result = 1;
                nodecounter = startnode+nodecount*scalei;
                for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
                {
                    float stage_sum = 0.f;
                    int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
                    float stagethreshold = as_float(stageinfo.y);
                    for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
                    {
                        __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
                        int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                        int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                        int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                        float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
                        float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
                        float nodethreshold = w.w * variance_norm_factor;
                        info1.x +=p_offset;
                        info1.z +=p_offset;
                        info2.x +=p_offset;
                        info2.z +=p_offset;
                        float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
                                sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
                        classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
                                sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
                        info3.x +=p_offset;
                        info3.z +=p_offset;
                        classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
                                sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
                        nodecounter++;
                    }
                    result=(stage_sum>=stagethreshold);
                }
                if(result&&(ix<width)&&(iy<height))
                {
                    int queueindex=atomic_inc(lclcount);
                    lcloutindex[queueindex<<1]=(y<<16)|x;
                    lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor);
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                int queuecount=lclcount[0];
                nodecounter=splitnode+nodecount*scalei;
                for(int stageloop=split_stage;stageloop<end_stage&&queuecount>0;stageloop++)
@@ -244,34 +244,34 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                    for(int lcl_loop=0;lcl_loop<lcl_loops&&tempnodecounter<stageinfo.x;lcl_loop++)
                    {
                        __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
                        int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                        int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                        int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                        float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
                        float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
                        float nodethreshold = w.w * variance_norm_factor;
                        info1.x +=queue_offset;
                        info1.z +=queue_offset;
                        info2.x +=queue_offset;
                        info2.z +=queue_offset;
                        float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
                                sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
                        classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
                                sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;

                        info3.x +=queue_offset;
                        info3.z +=queue_offset;
                        classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
                                sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
                        part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
                        tempnodecounter+=lcl_compute_win;
                    }
                    partialsum[lcl_id]=part_sum;
                    barrier(CLK_LOCAL_MEM_FENCE);
                    for(int i=0;i<lcl_compute_win&&(lcl_compute_id==0);i++)
                    {
                        stage_sum+=partialsum[lcl_id+i];
                    }
                }
                if(stage_sum>=stagethreshold&&(lcl_compute_id==0))
                {
                    int queueindex=atomic_inc(lclcount);
@@ -298,8 +298,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                    candidate[outputoff+temp+lcl_id]=candidate_result;
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
    }
}
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode * orinode, global GpuHidHaarTreeNode * newnode,float scale,float weight_scale,int nodenum)

@@ -33,106 +33,106 @@
// the use of this software, even if advised of the possibility of such damage.

__kernel void bilateral_C1_D0(__global uchar *dst,
                              __global const uchar *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float sum = 0.f, wsum = 0.f;

        int val0 = (int)src[src_addr];
        for(int k = 0; k < maxk; k++ )
        {
            int val = (int)src[src_addr + space_ofs[k]];
            float w = space_weight[k]*color_weight[abs(val - val0)];
            sum += (float)(val)*w;
            wsum += w;
        }
        dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
    }
}
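The kernel only indexes the two precomputed tables; it does not fix how they are filled. A hedged host-side sketch (plain C, hypothetical names) of Gaussian tables matching the indexing contract above — `color_weight` indexed by absolute intensity difference, `space_weight`/`space_ofs` indexed by neighbourhood position:

/* Hypothetical host-side setup, assuming Gaussian weights; only the
   indexing contract with the kernel above is taken from this file. */
#include <math.h>
void build_bilateral_tables(float sigma_color, float sigma_space,
                            int radius, int src_step,
                            float color_weight[256],
                            float space_weight[/* up to (2*radius+1)^2 */],
                            int   space_ofs[/* up to (2*radius+1)^2 */],
                            int  *maxk_out)
{
    double gc = -0.5 / (sigma_color * sigma_color);
    double gs = -0.5 / (sigma_space * sigma_space);
    for (int i = 0; i < 256; i++)                  /* |val - val0| is at most 255 */
        color_weight[i] = (float)exp(i * i * gc);
    int maxk = 0;
    for (int dy = -radius; dy <= radius; dy++)
        for (int dx = -radius; dx <= radius; dx++)
        {
            double r2 = (double)dx * dx + (double)dy * dy;
            if (r2 > (double)radius * radius)
                continue;                          /* round support */
            space_weight[maxk] = (float)exp(r2 * gs);
            space_ofs[maxk++]  = dy * src_step + dx; /* flat offset for src_addr + space_ofs[k] */
        }
    *maxk_out = maxk;
}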
__kernel void bilateral2_C1_D0(__global uchar *dst,
                               __global const uchar *src,
                               const int dst_rows,
                               const int dst_cols,
                               const int maxk,
                               const int radius,
                               const int dst_step,
                               const int dst_offset,
                               const int src_step,
                               const int src_rows,
                               const int src_cols,
                               __constant float *color_weight,
                               __constant float *space_weight,
                               __constant int *space_ofs)
{
    int gidx = get_global_id(0)<<2;
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)(0.f), wsum = (float4)(0.f);

        int4 val0 = convert_int4(vload4(0,src+src_addr));
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
            float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
            sum += convert_float4(val)*w;
            wsum += w;
        }
        *(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
    }
}
__kernel void bilateral_C4_D0(__global uchar4 *dst,
                              __global const uchar4 *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    if((gidy<dst_rows) && (gidx<dst_cols))
    {
        int src_addr = mad24(gidy+radius,src_step,gidx+radius);
        int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
        float4 sum = (float4)0.f;
        float wsum = 0.f;

        int4 val0 = convert_int4(src[src_addr]);
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(src[src_addr + space_ofs[k]]);
            float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
            sum += convert_float4(val)*(float4)w;
            wsum += w;
        }
        wsum=1.f/wsum;
        dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
    }
}

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 : (addr))
#endif
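Composing the two horizontal macros clamps an index into [0, r_edge-1], which is exactly BORDER_REPLICATE; a worked example:

// e.g. for a row of width 8 (r_edge = 8):
//   ADDR_R(ADDR_L(-3, 0, 8), 8, ADDR_L(-3, 0, 8)) -> 0   (left replicate)
//   ADDR_R(ADDR_L( 9, 0, 8), 8, ADDR_L( 9, 0, 8)) -> 7   (right replicate)
//   ADDR_R(ADDR_L( 5, 0, 8), 8, ADDR_L( 5, 0, 8)) -> 5   (in range, unchanged)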

#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
    for(int i=0; i < ksY+1; i++)
    {
        dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
        dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
        dx_data[i] = dx_con ? dx_s : 0.0;
        dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
        dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
        dy_data[i] = dy_con ? dy_s : 0.0;
        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];

        int dy_selected_row;
        int dy_selected_col;
        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];

        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
        data[2][i] = dy_data[i] * dy_data[i];
@@ -189,12 +189,12 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl

    if(posX < dst_cols && (posY) < dst_rows)
    {
        dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
            tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
    }
    if(posX < dst_cols && (posY + 1) < dst_rows)
    {
        dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
            tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
    }
}
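The two stores above are the Harris response written out for rows posY and posY+1. With a, b, c the box-smoothed Sigma dx*dx, Sigma dx*dy, Sigma dy*dy products:

// M = [ a  b ]      R = det(M) - k * trace(M)^2
//     [ b  c ]        = a*c - b*b - k*(a + c)^2
// which matches tmp_sum[0]*tmp_sum[4] - tmp_sum[2]^2 - k*(tmp_sum[0]+tmp_sum[4])^2 above.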

@@ -53,8 +53,8 @@
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 : (addr))
#endif

#ifdef BORDER_REFLECT
@@ -120,10 +120,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
    for(int i=0; i < ksY+1; i++)
    {
        dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
        dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
        dx_data[i] = dx_con ? dx_s : 0.0;
        dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
        dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
        dy_data[i] = dy_con ? dy_s : 0.0;
        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
@@ -139,7 +139,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];

        int dy_selected_row;
        int dy_selected_col;
        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
@@ -147,7 +147,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];

        data[0][i] = dx_data[i] * dx_data[i];
        data[1][i] = dx_data[i] * dy_data[i];
        data[2][i] = dy_data[i] * dy_data[i];
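        // calcMinEigenVal smooths the same three products; for the 2x2 structure
        // tensor built from them, the quantity it ultimately needs is the smaller
        // eigenvalue, which has the closed form
        //   lambda_min = (a + c)/2 - sqrt( ((a - c)/2)^2 + b^2 )
        // with a = sum(dx*dx), b = sum(dx*dy), c = sum(dy*dy) from data[0..2] above.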

@@ -56,19 +56,19 @@ inline float calc(int x, int y)
{
    return (float)abs(x) + abs(y);
}
#endif //

// Smoothing perpendicular to the derivative direction with a triangle filter
// only support 3x3 Sobel kernel
// h (-1) =  1, h (0) =  2, h (1) =  1
// h'(-1) = -1, h'(0) =  0, h'(1) =  1
// thus sobel 2D operator can be calculated as:
// h'(x, y) = h'(x)h(y) for x direction
//
// src            input 8bit single channel image data
// dx_buf         output dx buffer
// dy_buf         output dy buffer
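Written out, the separable products described above give the usual 3x3 Sobel kernels (up to sign convention); the row pass applies the horizontal factor and the column pass the vertical one:

// Sobel_x = h(y) * h'(x):        Sobel_y = h'(y) * h(x):
//   [ -1  0  1 ]                   [ -1 -2 -1 ]
//   [ -2  0  2 ]                   [  0  0  0 ]
//   [ -1  0  1 ]                   [  1  2  1 ]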
__kernel
void calcSobelRowPass
(
    __global const uchar * src,
@@ -99,11 +99,11 @@ __kernel

    __local int smem[16][18];

    smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
    if(lidx == 0)
    {
        smem[lidy][0]  = src[max(gidx - 1, 0) + gidy * src_step + src_offset];
        smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -122,7 +122,7 @@ __kernel

// calculate the magnitude of the filter pass combining both x and y directions
// This is the buffered version (3x3 sobel)
//
// dx_buf         dx buffer, calculated from calcSobelRowPass
// dy_buf         dy buffer, calculated from calcSobelRowPass
// dx             derivative in x direction output
@@ -169,7 +169,7 @@ __kernel

    __local int sdx[18][16];
    __local int sdy[18][16];

    sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
    sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
    if(lidy == 0)
@@ -199,7 +199,7 @@ __kernel

// calculate the magnitude of the filter pass combining both x and y directions
// This is the non-buffered version (non-3x3 sobel)
//
// dx_buf         dx buffer, calculated from calcSobelRowPass
// dy_buf         dy buffer, calculated from calcSobelRowPass
// dx             derivative in x direction output
@@ -233,9 +233,9 @@ __kernel

    if(gidy < rows && gidx < cols)
    {
        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
            calc(
                dx[gidx + gidy * dx_step + dx_offset],
                dy[gidx + gidy * dy_step + dy_offset]
            );
    }
@@ -251,7 +251,7 @@ __kernel
// 0 - below low thres, not an edge
// 1 - maybe an edge
// 2 - is an edge, either magnitude is greater than high thres, or
// Given estimates of the image gradients, a search is then carried out
// to determine if the gradient magnitude assumes a local maximum in the gradient direction.
// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
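Combined with the comments above, the per-pixel classification can be summarized as the following selection (a paraphrase of the kernel logic with illustrative names, not a verbatim excerpt):

// int edge_type = 0;                          // below low threshold
// if (m > low_thresh && is_local_maximum)     // maximum along the rounded gradient direction
//     edge_type = (m > high_thresh) ? 2 : 1;  // definite edge vs. candidate
// map[gidy * map_step + gidx] = edge_type;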
@@ -265,7 +265,7 @@ __kernel
void calcMap
(
    __global const int * dx,
    __global const int * dy,
    __global const float * mag,
    __global int * map,
    int rows,
@@ -362,10 +362,10 @@ __kernel

// non local memory version
__kernel
void calcMap_2
(
    __global const int * dx,
    __global const int * dy,
    __global const float * mag,
    __global int * map,
    int rows,
@@ -444,7 +444,7 @@ __kernel
void calcMap_3
(
    __global const int * dx,
    __global const int * dy,
    __global const float * mag,
    __global int * map,
    int rows,
@@ -550,9 +550,9 @@ __kernel
//
// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
// marked as edge. Each thread will iterate for 16 times to connect local edges.
// A candidate pixel identified as an edge is then tested for nearby potential edge points. If there are any, the counter is
// incremented and the point locations are stored. These potential candidates are processed further in the next kernel.
//
// map            raw edge type results calculated from calcMap.
// st             the potential edge points found in this kernel call
// counter        the number of potential edge points
@@ -560,7 +560,7 @@ __kernel
void edgesHysteresisLocal
(
    __global int * map,
    __global ushort2 * st,
    volatile __global unsigned int * counter,
    int rows,
    int cols,
@@ -657,8 +657,8 @@ __kernel
void edgesHysteresisGlobal
(
    __global int * map,
    __global ushort2 * st1,
    __global ushort2 * st2,
    volatile __global int * counter,
    int rows,
    int cols,

@@ -57,24 +57,24 @@
/// CV_32FC1
__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
{
    const int x = get_global_id(0);

    srcStep >>= 2;
    dstStep >>= 2;

    if (x < srcCols)
    {
        int srcIdx = x;
        int dstIdx = x;

        float sum = 0;

        for (int y = 0; y < srcRows; ++y)
        {
            sum += src[srcIdx];
            dst[dstIdx] = sum;
            srcIdx += srcStep;
            dstIdx += dstStep;
        }
    }
}

@@ -53,7 +53,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
    int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
{
    __local float smem[16 + 2 * 8][16 + 2 * 8];

    int x = get_local_id(0);
    int y = get_local_id(1);
    int gx = get_global_id(0);
@@ -92,7 +92,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];

    barrier(CLK_LOCAL_MEM_FENCE);

    if (gx < cols && gy < rows)
    {
        float res = 0;

|
||||
#endif
|
||||
|
||||
__kernel void copymakeborder
|
||||
(__global const GENTYPE *src,
|
||||
__global GENTYPE *dst,
|
||||
(__global const GENTYPE *src,
|
||||
__global GENTYPE *dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_cols,
|
||||
const int src_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_in_pixel,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
const int dst_rows,
|
||||
const int src_cols,
|
||||
const int src_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_in_pixel,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
const int top,
|
||||
const int left,
|
||||
const GENTYPE val
|
||||
const int left,
|
||||
const GENTYPE val
|
||||
)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int src_x = x-left;
|
||||
int src_y = y-top;
|
||||
int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
|
||||
int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
int con = (src_x >= 0) && (src_x < src_cols) && (src_y >= 0) && (src_y < src_rows);
|
||||
if(con)
|
||||
{
|
||||
dst[dst_addr] = src[src_addr];
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
//write the result to dst
|
||||
if((x<dst_cols) && (y<dst_rows))
|
||||
{
|
||||
dst[dst_addr] = val;
|
||||
}
|
||||
#else
|
||||
int s_x,s_y;
|
||||
//judge if read out of boundary
|
||||
s_x= ADDR_L(src_x,0,src_cols,src_x);
|
||||
s_x= ADDR_R(src_x,src_cols,s_x);
|
||||
s_y= ADDR_L(src_y,0,src_rows,src_y);
|
||||
s_y= ADDR_R(src_y,src_rows,s_y);
|
||||
src_addr=mad24(s_y,src_step_in_pixel,s_x+src_offset_in_pixel);
|
||||
//write the result to dst
|
||||
if((x<dst_cols) && (y<dst_rows))
|
||||
{
|
||||
dst[dst_addr] = src[src_addr];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int src_x = x-left;
|
||||
int src_y = y-top;
|
||||
int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
|
||||
int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
int con = (src_x >= 0) && (src_x < src_cols) && (src_y >= 0) && (src_y < src_rows);
|
||||
if(con)
|
||||
{
|
||||
dst[dst_addr] = src[src_addr];
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
//write the result to dst
|
||||
if((x<dst_cols) && (y<dst_rows))
|
||||
{
|
||||
dst[dst_addr] = val;
|
||||
}
|
||||
#else
|
||||
int s_x,s_y;
|
||||
//judge if read out of boundary
|
||||
s_x= ADDR_L(src_x,0,src_cols,src_x);
|
||||
s_x= ADDR_R(src_x,src_cols,s_x);
|
||||
s_y= ADDR_L(src_y,0,src_rows,src_y);
|
||||
s_y= ADDR_R(src_y,src_rows,s_y);
|
||||
src_addr=mad24(s_y,src_step_in_pixel,s_x+src_offset_in_pixel);
|
||||
//write the result to dst
|
||||
if((x<dst_cols) && (y<dst_rows))
|
||||
{
|
||||
dst[dst_addr] = src[src_addr];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void copymakeborder_C1_D0
|
||||
(__global const uchar *src,
|
||||
__global uchar *dst,
|
||||
(__global const uchar *src,
|
||||
__global uchar *dst,
|
||||
const int dst_cols,
|
||||
const int dst_rows,
|
||||
const int src_cols,
|
||||
const int src_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_in_pixel,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
const int dst_rows,
|
||||
const int src_cols,
|
||||
const int src_rows,
|
||||
const int src_step_in_pixel,
|
||||
const int src_offset_in_pixel,
|
||||
const int dst_step_in_pixel,
|
||||
const int dst_offset_in_pixel,
|
||||
const int top,
|
||||
const int left,
|
||||
const uchar val
|
||||
const int left,
|
||||
const uchar val
|
||||
)
|
||||
{
|
||||
int x = get_global_id(0)<<2;
|
||||
int y = get_global_id(1);
|
||||
int src_x = x-left;
|
||||
int src_y = y-top;
|
||||
int src_addr = mad24(src_y,src_step_in_pixel,src_x+src_offset_in_pixel);
|
||||
int dst_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
|
||||
    int con = (src_x >= 0) && (src_x+3 < src_cols) && (src_y >= 0) && (src_y < src_rows);
    if(con)
    {
        uchar4 tmp = vload4(0, src + src_addr);
        *(__global uchar4*)(dst + dst_addr) = tmp;
    }
    else
    {
#ifdef BORDER_CONSTANT
        // write the result to dst
        if((((src_x < 0) && (src_x+3 >= 0)) || ((src_x < src_cols) && (src_x+3 >= src_cols))) && (src_y >= 0) && (src_y < src_rows))
        {
            int4 addr;
            uchar4 tmp;
            addr.x = ((src_x   < 0) || (src_x   >= src_cols)) ? 0 : src_addr;
            addr.y = ((src_x+1 < 0) || (src_x+1 >= src_cols)) ? 0 : (src_addr+1);
            addr.z = ((src_x+2 < 0) || (src_x+2 >= src_cols)) ? 0 : (src_addr+2);
            addr.w = ((src_x+3 < 0) || (src_x+3 >= src_cols)) ? 0 : (src_addr+3);
            tmp.x = src[addr.x];
            tmp.y = src[addr.y];
            tmp.z = src[addr.z];
            tmp.w = src[addr.w];
            tmp.x = (src_x   >= 0) && (src_x   < src_cols) ? tmp.x : val;
            tmp.y = (src_x+1 >= 0) && (src_x+1 < src_cols) ? tmp.y : val;
            tmp.z = (src_x+2 >= 0) && (src_x+2 < src_cols) ? tmp.z : val;
            tmp.w = (src_x+3 >= 0) && (src_x+3 < src_cols) ? tmp.w : val;
            *(__global uchar4*)(dst + dst_addr) = tmp;
        }
        else if((x < dst_cols) && (y < dst_rows))
        {
            *(__global uchar4*)(dst + dst_addr) = (uchar4)val;
        }
#else
        int4 s_x;
        int s_y;
        // clamp reads that would fall outside the source image
        s_x.x = ADDR_L(src_x,   0, src_cols, src_x);
        s_x.y = ADDR_L(src_x+1, 0, src_cols, src_x+1);
        s_x.z = ADDR_L(src_x+2, 0, src_cols, src_x+2);
        s_x.w = ADDR_L(src_x+3, 0, src_cols, src_x+3);
        s_x.x = ADDR_R(src_x,   src_cols, s_x.x);
        s_x.y = ADDR_R(src_x+1, src_cols, s_x.y);
        s_x.z = ADDR_R(src_x+2, src_cols, s_x.z);
        s_x.w = ADDR_R(src_x+3, src_cols, s_x.w);
        s_y = ADDR_L(src_y, 0, src_rows, src_y);
        s_y = ADDR_R(src_y, src_rows, s_y);
        int4 src_addr4 = mad24((int4)s_y, (int4)src_step_in_pixel, s_x + (int4)src_offset_in_pixel);
        // write the result to dst
        if((x < dst_cols) && (y < dst_rows))
        {
            uchar4 tmp;
            tmp.x = src[src_addr4.x];
            tmp.y = src[src_addr4.y];
            tmp.z = src[src_addr4.z];
            tmp.w = src[src_addr4.w];
            *(__global uchar4*)(dst + dst_addr) = tmp;
        }
#endif
    }
}

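The branch above first tries the fast all-inside path (a single vload4 plus one vector store), then falls back to per-lane masking for BORDER_CONSTANT, or to index clamping otherwise (ADDR_L/ADDR_R appear to apply the left/right boundary rule of the selected border mode). A minimal scalar sketch of the constant-border case, assuming one work-item per destination pixel; all names and parameters here are illustrative, not the module's actual interface:

__kernel void copy_border_const_sketch(__global const uchar *src, __global uchar *dst,
                                       int src_rows, int src_cols, int src_step,
                                       int dst_rows, int dst_cols, int dst_step,
                                       int top, int left, uchar val)
{
    int x = get_global_id(0), y = get_global_id(1);
    if (x >= dst_cols || y >= dst_rows) return;
    int sx = x - left, sy = y - top;
    // inside the source image: plain copy; outside: the constant border value
    uchar v = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
              ? src[mad24(sy, src_step, sx)] : val;
    dst[mad24(y, dst_step, x)] = v;
}
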
@@ -34,7 +34,7 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTIAL_HISTOGRAM256_COUNT   (256)
#define HISTOGRAM256_BIN_COUNT       (256)

#define HISTOGRAM256_WORK_GROUP_SIZE (256)
@@ -45,12 +45,12 @@


__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1))) void calc_sub_hist_D0(
    __global const uint4* src,
    int src_step, int src_offset,
    __global int* globalHist,
    int dataCount, int cols,
    int inc_x, int inc_y,
    int hist_step)
{
    __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
    int gid = get_global_id(0);
@@ -63,7 +63,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
    int offset = (lid & (NBANKS-1)); // lid % NBANKS
    uint4 data, temp1, temp2, temp3, temp4;
    src += src_offset;

    // clear LDS
    for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
    {
@@ -73,7 +73,7 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
        subhist[idx+=lsize] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // read and scatter
    int y = gid / cols;
    int x = gid - mul24(y, cols);
@@ -87,35 +87,35 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
        temp3 = ((data & mask) << NBANKS_BIT) + offset;
        data >>= shift;
        temp4 = ((data & mask) << NBANKS_BIT) + offset;

        atomic_inc(subhist + temp1.x);
        atomic_inc(subhist + temp1.y);
        atomic_inc(subhist + temp1.z);
        atomic_inc(subhist + temp1.w);

        atomic_inc(subhist + temp2.x);
        atomic_inc(subhist + temp2.y);
        atomic_inc(subhist + temp2.z);
        atomic_inc(subhist + temp2.w);

        atomic_inc(subhist + temp3.x);
        atomic_inc(subhist + temp3.y);
        atomic_inc(subhist + temp3.z);
        atomic_inc(subhist + temp3.w);

        atomic_inc(subhist + temp4.x);
        atomic_inc(subhist + temp4.y);
        atomic_inc(subhist + temp4.z);
        atomic_inc(subhist + temp4.w);

        x += inc_x;
        int off = ((x >= cols) ? -1 : 0);
        x = mad24(off, cols, x);
        y += inc_y - off;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // reduce local banks to single histogram per workgroup
    int bin1=0, bin2=0, bin3=0, bin4=0;
    for(int i=0; i<NBANKS; i+=4)
    {
@@ -124,19 +124,19 @@ __kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void c
        bin3 += subhist[(lid << NBANKS_BIT) + i+2];
        bin4 += subhist[(lid << NBANKS_BIT) + i+3];
    }

    globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
}

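calc_sub_hist_D0 replicates each of the 256 bins across NBANKS copies in local memory and selects the copy by lid % NBANKS, so simultaneous atomic_inc calls on the same gray level mostly land on different words and serialize far less. A compact sketch of the same banking idea, assuming NBANKS = 16 and a 256-wide workgroup (the real kernel additionally reads uint4 vectors and strides in x/y):

#define NBINS      256
#define NBANKS     16
#define NBANKS_BIT 4
__kernel void hist_banked_sketch(__global const uchar *data, int n, __global int *hist)
{
    __local int subhist[NBINS << NBANKS_BIT];          // 16 copies of 256 bins
    int lid = (int)get_local_id(0), lsize = (int)get_local_size(0);
    for (int i = lid; i < (NBINS << NBANKS_BIT); i += lsize) subhist[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    int bank = lid & (NBANKS - 1);                     // this lane's private copy
    for (int i = (int)get_global_id(0); i < n; i += (int)get_global_size(0))
        atomic_inc(subhist + (((int)data[i] << NBANKS_BIT) + bank));
    barrier(CLK_LOCAL_MEM_FENCE);
    // fold the 16 copies of each bin; one work-item per bin (assumes lsize == NBINS)
    int sum = 0;
    for (int b = 0; b < NBANKS; b++) sum += subhist[(lid << NBANKS_BIT) + b];
    hist[mad24((int)get_group_id(0), NBINS, lid)] = sum;
}
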
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1))) calc_sub_hist_border_D0(
    __global const uchar* src,
    int src_step, int src_offset,
    __global int* globalHist,
    int left_col, int cols,
    int rows, int hist_step)
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);
    int lidy = get_local_id(1);
    int gx = get_group_id(0);
    int gy = get_group_id(1);
@@ -160,9 +160,9 @@ __kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))c

    globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1))) void merge_hist(__global int* buf,
    __global int* hist,
    int src_step)
{
    int lx = get_local_id(0);
    int gx = get_group_id(0);
@@ -183,83 +183,83 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
    }

    if(lx == 0)
        hist[gx] = data[0];
}

__kernel __attribute__((reqd_work_group_size(256,1,1))) void calLUT(
    __global uchar * dst,
    __constant int * hist,
    float scale)
{
    int lid = get_local_id(0);
    __local int sumhist[HISTOGRAM256_BIN_COUNT];
    //__local uchar lut[HISTOGRAM256_BIN_COUNT+1];

    sumhist[lid] = hist[lid];
    barrier(CLK_LOCAL_MEM_FENCE);
    if(lid == 0)
    {
        int sum = 0;
        for(int i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
        {
            sum += sumhist[i];
            sumhist[i] = sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    dst[lid] = lid == 0 ? 0 : convert_uchar_sat(convert_float(sumhist[lid])*scale);
}
/*
///////////////////////////////equalizeHist//////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(256,1,1))) void equalizeHist(
    __global uchar * src,
    __global uchar * dst,
    __constant int * hist,
    int srcstep,
    int srcoffset,
    int dststep,
    int dstoffset,
    int width,
    int height,
    float scale,
    int inc_x,
    int inc_y)
{
    int gidx = get_global_id(0);
    int lid = get_local_id(0);
    int glb_size = get_global_size(0);
    src += srcoffset;
    dst += dstoffset;
    __local int sumhist[HISTOGRAM256_BIN_COUNT];
    __local uchar lut[HISTOGRAM256_BIN_COUNT+1];

    sumhist[lid] = hist[lid];
    barrier(CLK_LOCAL_MEM_FENCE);
    if(lid == 0)
    {
        int sum = 0;
        for(int i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
        {
            sum += sumhist[i];
            sumhist[i] = sum;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    lut[lid] = convert_uchar_sat(convert_float(sumhist[lid])*scale);
    lut[0] = 0;
    int pos_y = gidx / width;
    int pos_x = gidx - mul24(pos_y, width);

    for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
    {
        int inaddr = mad24(pos_y,srcstep,pos_x);
        int outaddr = mad24(pos_y,dststep,pos_x);
        dst[outaddr] = lut[src[inaddr]];
        pos_x += inc_x;
        int off = (pos_x >= width ? -1 : 0);
        pos_x = mad24(off,width,pos_x);
        pos_y += inc_y - off;
    }
}
*/

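calLUT above is the classic histogram-equalization table: a serial in-group prefix sum turns the 256-bin histogram into a CDF, which is then scaled into 0..255 with bin 0 pinned to zero. An equivalent serial sketch of the arithmetic (a hypothetical helper, not part of the module; the saturating truncation mirrors convert_uchar_sat):

void build_equalize_lut(const int hist[256], int total, uchar lut[256])
{
    int cdf = 0;
    float scale = 255.0f / (float)total;   // assumes total > 0
    for (int i = 0; i < 256; i++)
    {
        cdf += hist[i];                    // running cumulative histogram
        float v = scale * (float)cdf;
        lut[i] = (i == 0) ? (uchar)0 : (uchar)(v > 255.0f ? 255.0f : v);
    }
}
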
@@ -73,27 +73,27 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
    {
        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);

        sum_t[0]   = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1]   = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);

        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);

        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
@@ -102,7 +102,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -113,23 +113,23 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
        if(lid > 0 && (i+lid) <= rows){
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -139,7 +139,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
                sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
            }
            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
            for(int k = 0; k < 4; k++)
@@ -147,7 +147,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
                sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
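The up-sweep/down-sweep pairs in integral_cols above are a work-efficient (Blelloch) scan over two 128-element halves, with GET_CONFLICT_OFFSET padding indices to dodge local-memory bank conflicts. Stripped of the padding and the dual-half bookkeeping, the core scan looks like this sketch (LSIZE assumed to be the power-of-two workgroup size; names are illustrative):

#define LSIZE 256
__kernel void scan_sketch(__global int *data)
{
    __local int tmp[LSIZE];
    int lid = (int)get_local_id(0);
    tmp[lid] = data[get_global_id(0)];
    int offset = 1;
    for (int d = LSIZE >> 1; d > 0; d >>= 1)   // up-sweep: build partial sums
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lid < d)
        {
            int ai = offset * (2*lid + 1) - 1, bi = ai + offset;
            tmp[bi] += tmp[ai];
        }
        offset <<= 1;
    }
    if (lid == 0) tmp[LSIZE - 1] = 0;          // clear the root -> exclusive scan
    for (int d = 1; d < LSIZE; d <<= 1)        // down-sweep: distribute the sums
    {
        offset >>= 1;
        barrier(CLK_LOCAL_MEM_FENCE);
        if (lid < d)
        {
            int ai = offset * (2*lid + 1) - 1, bi = ai + offset;
            int t = tmp[ai]; tmp[ai] = tmp[bi]; tmp[bi] += t;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    data[get_global_id(0)] = tmp[lid];
}
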
@@ -173,27 +173,27 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
        src_t[1]   = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;

        sum_t[0]   = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1]   = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = sqsrc_t[0];

        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = sqsrc_t[1];

        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
@@ -202,7 +202,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -213,14 +213,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
@@ -235,7 +235,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
        {
            int loc0 = gid * 2 * sum_step;
            int loc1 = gid * 2 * sqsum_step;
            for(int k = 1; k <= 8; k++)
            {
                if(gid * 8 + k > cols) break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
@@ -245,8 +245,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
        int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
        if(lid > 0 && (i+lid) <= rows){
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
@@ -256,7 +256,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
                if(gid * 8 + k >= cols) break;
                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
                sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
            }
            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
            for(int k = 0; k < 4; k++)
@@ -264,7 +264,7 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
                if(gid * 8 + 4 + k >= cols) break;
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
                sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

@@ -70,23 +70,23 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
    {
        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);

        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];

        lm_sum[1][bf_loc] = src_t[1];

        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
@@ -94,7 +94,7 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -104,32 +104,32 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid > 0 && (i+lid) <= rows){
            int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
            }
            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
@@ -150,23 +150,23 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
    {
        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;

        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];

        lm_sum[1][bf_loc] = src_t[1];

        int offset = 1;
        for(int d = LSIZE >> 1 ; d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
@@ -174,7 +174,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
@@ -184,9 +184,9 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
@@ -201,13 +201,13 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
        if(i + lid == 0)
        {
            int loc0 = gid * 2 * sum_step;
            for(int k = 1; k <= 8; k++)
            {
                if(gid * 8 + k > cols) break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
            }
        }

        if(lid > 0 && (i+lid) <= rows){
            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
            lm_sum[0][bf_loc] += sum_t[0];
@@ -223,7 +223,7 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
            {
                if(gid * 8 + 4 + k >= cols) break;
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

@@ -39,75 +39,75 @@
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep, int m)
{
    int dx = get_global_id(0) - (m>>1);
    int dy = get_global_id(1) - (m>>1);

    short histom[256];
    for(int i=0;i<256;++i)
        histom[i]=0;

    for(int i=0;i<m;++i)
    {
        __global uchar * data = src + srcOffset + mul24(srcStep, clamp(dy + (i), 0, rows-1));
        for(int j=dx;j<dx+m;++j)
        {
            histom[data[clamp(j, 0, cols-1)]]++;
        }
    }

    int now=0;
    int goal=(m*m+1)>>1;
    int v;
    for(int i=0;i<256;++i)
    {
        v = (now<goal ? i : v);
        now += histom[i];
    }

    if(dy<rows && dx<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = v;
}
*/
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local uchar4 data[18][18];
    __global uchar4* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 1;
    int dy = get_global_id(1) - get_local_id(1) - 1;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);

    int dr = id / 18;
    int dc = id % 18;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    uchar4 mid;

    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p4;
}
#undef op

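op(a,b) is a compare-exchange: afterwards a holds min(a,b) and b holds max(a,b). The nineteen exchanges above form a median-of-9 sorting network that leaves the median in p4 with no data-dependent branching, which is why the kernel works unchanged for uchar, float, and their 4-lane vectors. The same network on scalars, as a sketch:

inline void cmp_swap(int *a, int *b) { int m = *a; *a = min(*a, *b); *b = max(m, *b); }

inline int median9(int p0,int p1,int p2,int p3,int p4,int p5,int p6,int p7,int p8)
{
    // exchange sequence copied from the kernel above
    cmp_swap(&p1,&p2); cmp_swap(&p4,&p5); cmp_swap(&p7,&p8); cmp_swap(&p0,&p1);
    cmp_swap(&p3,&p4); cmp_swap(&p6,&p7); cmp_swap(&p1,&p2); cmp_swap(&p4,&p5);
    cmp_swap(&p7,&p8); cmp_swap(&p0,&p3); cmp_swap(&p5,&p8); cmp_swap(&p4,&p7);
    cmp_swap(&p3,&p6); cmp_swap(&p1,&p4); cmp_swap(&p2,&p5); cmp_swap(&p4,&p7);
    cmp_swap(&p4,&p2); cmp_swap(&p6,&p4); cmp_swap(&p4,&p2);
    return p4;
}
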
@@ -115,41 +115,41 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local uchar data[18][18];
    __global uchar* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 1;
    int dy = get_global_id(1) - get_local_id(1) - 1;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);

    int dr = id / 18;
    int dc = id % 18;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    uchar mid;

    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p4;
}
#undef op

@@ -157,41 +157,41 @@ __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local float data[18][18];
    __global float* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 1;
    int dy = get_global_id(1) - get_local_id(1) - 1;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);

    int dr = id / 18;
    int dc = id % 18;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    float mid;

    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p4;
}
#undef op

@@ -199,41 +199,41 @@ __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, i
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local float4 data[18][18];
    __global float4* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 1;
    int dy = get_global_id(1) - get_local_id(1) - 1;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);

    int dr = id / 18;
    int dc = id % 18;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+9, 0, rows-1);
    data[dr+9][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
    float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
    float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
    float4 mid;

    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
    op(p4, p2); op(p6, p4); op(p4, p2);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p4;
}
#undef op

@@ -241,36 +241,36 @@ __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local uchar4 data[20][20];
    __global uchar4* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 2;
    int dy = get_global_id(1) - get_local_id(1) - 2;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);

    int dr = id / 20;
    int dc = id % 20;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    uchar4 mid;

    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -293,9 +293,9 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p12;
}
#undef op

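The 5x5 kernels enlarge the cooperative load: a 16x16 workgroup fills a 20x20 local tile (a 2-pixel apron on every side), with each of the first 200 work-items loading one pixel from the top half of the tile and one from the bottom half, and clamped coordinates standing in for border replication. A sketch of just that load pattern, with hypothetical names:

#define TILE 20
__kernel void load_halo_sketch(__global const uchar *src, __global uchar *dst,
                               int rows, int cols, int src_step, int dst_step)
{
    __local uchar tile[TILE][TILE];
    int dx = (int)(get_global_id(0) - get_local_id(0)) - 2;   // tile origin minus apron
    int dy = (int)(get_global_id(1) - get_local_id(1)) - 2;
    int id = min((int)(get_local_id(0)*16 + get_local_id(1)), TILE/2*TILE - 1);
    int dr = id / TILE, dc = id % TILE;
    int c = clamp(dx + dc, 0, cols - 1);                      // replicate border by clamping
    int r = clamp(dy + dr, 0, rows - 1);
    tile[dr][dc] = src[mad24(r, src_step, c)];                // tile rows 0..9
    r = clamp(dy + dr + TILE/2, 0, rows - 1);
    tile[dr + TILE/2][dc] = src[mad24(r, src_step, c)];       // tile rows 10..19
    barrier(CLK_LOCAL_MEM_FENCE);
    // a 5x5 window op on tile[][] would go here; pass the centre through as a placeholder
    if (get_global_id(0) < cols && get_global_id(1) < rows)
        dst[mad24((int)get_global_id(1), dst_step, (int)get_global_id(0))] =
            tile[get_local_id(1) + 2][get_local_id(0) + 2];
}
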
@@ -303,36 +303,36 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local uchar data[20][20];
    __global uchar* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 2;
    int dy = get_global_id(1) - get_local_id(1) - 2;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);

    int dr = id / 20;
    int dc = id % 20;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    uchar mid;

    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -355,9 +355,9 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p12;
}
#undef op

@@ -365,36 +365,36 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local float4 data[20][20];
    __global float4* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 2;
    int dy = get_global_id(1) - get_local_id(1) - 2;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);

    int dr = id / 20;
    int dc = id % 20;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    float4 mid;

    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -417,9 +417,9 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p12;
}
#undef op

@@ -427,36 +427,36 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
    int rows, int srcStep, int dstStep)
{
    __local float data[20][20];
    __global float* source = src + srcOffset;

    int dx = get_global_id(0) - get_local_id(0) - 2;
    int dy = get_global_id(1) - get_local_id(1) - 2;

    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);

    int dr = id / 20;
    int dc = id % 20;
    int r = clamp(dy+dr, 0, rows-1);
    int c = clamp(dx+dc, 0, cols-1);

    data[dr][dc] = source[r*srcStep + c];
    r = clamp(dy+dr+10, 0, rows-1);
    data[dr+10][dc] = source[r*srcStep + c];

    barrier(CLK_LOCAL_MEM_FENCE);

    int x = get_local_id(0);
    int y = get_local_id(1);
    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
    float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
    float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
    float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
    float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
    float mid;

    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
@@ -479,9 +479,9 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
    op(p7, p11); op(p11, p13); op(p11, p12);

    if(get_global_id(1)<rows && get_global_id(0)<cols)
        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)] = p12;
}
#undef op

@@ -48,7 +48,7 @@
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4;
#else
typedef float4 F4;
#endif

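This is the usual fp64 gate in these kernels: the host is expected to pass -D DOUBLE_SUPPORT only on devices advertising cl_khr_fp64, so F4 widens from float4 to double4 transparently. A hypothetical kernel written against the abstract type (a sketch, assuming the #if block above is in scope):

__kernel void scale_sketch(__global F4 *buf, float alpha)
{
    size_t i = get_global_id(0);
    buf[i] = buf[i] * (F4)(alpha);   // splat works for both float4 and double4
}
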
@@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < threadCols && y < dst_rows)
    {
        x = x << 2;
@@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig

        map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
        int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;

        uchar4 src_data;
        src_data.s0 = *(src + srcIdx.s0);
@@ -88,10 +88,10 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
        src_data.s3 = *(src + srcIdx.s3);
        uchar4 dst_data;
        dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows))) ? (uchar4)(val) : src_data;

        __global uchar4* d = (__global uchar4 *)(dst + dstStart);

        uchar4 dVal = *d;

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
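All of the remap*Constant kernels follow the same gather-then-mask shape: each mapped address is first forced to something safe so the gather cannot fault, the values are loaded, and vector selects then substitute the border constant for out-of-range lanes and merge with the existing destination on partially valid stores. A one-pixel sketch of the idea, with illustrative parameter names:

__kernel void remap_nn_const_sketch(__global const uchar *src, __global uchar *dst,
                                    __global const short2 *map,
                                    int src_rows, int src_cols, int src_step,
                                    int dst_rows, int dst_cols, int dst_step, uchar val)
{
    int x = get_global_id(0), y = get_global_id(1);
    if (x >= dst_cols || y >= dst_rows) return;
    short2 m = map[mad24(y, dst_cols, x)];   // assumes a densely packed (x, y) map
    int inside = (m.x >= 0 && m.x < src_cols && m.y >= 0 && m.y < src_rows);
    int addr = inside ? mad24((int)m.y, src_step, (int)m.x) : 0;  // safe dummy address
    dst[mad24(y, dst_step, x)] = inside ? src[addr] : val;        // mask after the load
}
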
@@ -107,7 +107,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < threadCols && y < dst_rows)
    {
        x = x << 2;
@@ -125,7 +125,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
        map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
        int8 map1_dataZ = convert_int8_sat_rte(map1_data);
        int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;

        uchar4 src_data;
        src_data.s0 = *(src + srcIdx.s0);
@@ -136,10 +136,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
        dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
        __global uchar4* d = (__global uchar4 *)(dst + dstStart);

        uchar4 dVal = *d;

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);

        dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
        *d = dst_data;

@@ -152,7 +152,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < threadCols && y < dst_rows)
    {
        x = x << 2;
@@ -173,7 +173,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
        float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
        int8 map_dataZ = convert_int8_sat_rte(map_data);
        int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;

        uchar4 src_data;
        src_data.s0 = *(src + srcIdx.s0);
@@ -184,10 +184,10 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
        dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
        __global uchar4* d = (__global uchar4 *)(dst + dstStart);

        uchar4 dVal = *d;

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);

        dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
        *d = dst_data;
    }
@@ -230,7 +230,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
        dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
        __global uchar16* d = (__global uchar16 *)(dst + dstStart);

        uchar16 dVal = *d;

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -279,7 +279,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
        dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
        __global uchar16* d = (__global uchar16 *)(dst + dstStart);

        uchar16 dVal = *d;

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -333,7 +333,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
        dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
        __global uchar16* d = (__global uchar16 *)(dst + dstStart);

        uchar16 dVal = *d;

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
@@ -351,9 +351,9 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < threadCols && y < dst_rows)
    {
        x = x << 4;

        int gx = x - (dst_offset&15);
@@ -368,25 +368,25 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
|
||||
short8 map1_data;
|
||||
|
||||
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
|
||||
|
||||
|
||||
int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
|
||||
|
||||
|
||||
float4 src_data;
|
||||
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
|
||||
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
|
||||
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
|
||||
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
|
||||
float4 dst_data;
|
||||
|
||||
|
||||
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
|
||||
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
|
||||
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
|
||||
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
|
||||
|
||||
|
||||
|
||||
|
||||
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
|
||||
|
||||
float4 dVal = *d;
|
||||
float4 dVal = *d;
|
||||
|
||||
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
|
||||
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
|
||||
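// Note (editor's gloss): map1 holds interleaved short (x, y) pairs, so
// .even extracts the x components and .odd the y components; for the
// float (D5) kernels the x index is shifted left by 2 to convert a
// 4-byte float column into a byte offset before adding y*src_step.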
@@ -402,7 +402,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *

{
int x = get_global_id(0);
int y = get_global_id(1);

if(x < threadCols && y < dst_rows)
{
x = x << 4;

@@ -422,23 +422,23 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *

int8 map1_dataZ = convert_int8_sat_rte(map1_data);

int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;

float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;

dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;

__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);

float4 dVal = *d;

int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -455,7 +455,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const

{
int x = get_global_id(0);
int y = get_global_id(1);

if(x < threadCols && y < dst_rows)
{
x = x << 4;

@@ -478,23 +478,23 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const

int8 map1_dataZ = convert_int8_sat_rte(map_data);

int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;

float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;

dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;

__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);

float4 dVal = *d;

int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -577,13 +577,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);

uchar4 nval = convert_uchar4(nVal);
uchar val = nval.s0;

int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);

@@ -607,7 +607,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);

@@ -617,7 +617,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);

b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);

@@ -649,7 +649,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);

int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDx < 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);

@@ -660,10 +660,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

d = (convert_uchar4(dc) == (uchar4)(0))? d : val;

uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );

__global uchar4* D = (__global uchar4 *)(dst + dstStart);

uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
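// Note (editor's gloss): dst_data above is the standard bilinear blend
//   dst = a*(1-u)*(1-v) + b*u*(1-v) + c*(1-u)*v + d*u*v
// with ud = 1-u and vd = 1-v, where a,b,c,d are the four neighbouring
// source taps and (u,v) is the fractional part of the mapped coordinate;
// the ac/bc/cc/dc range checks first replace out-of-bounds taps with the
// constant border value val.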
@@ -680,13 +680,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);

uchar4 nval = convert_uchar4(nVal);
uchar val = nval.s0;

int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);

@@ -713,7 +713,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi

int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);

@@ -723,7 +723,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi

a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);

b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);

@@ -755,7 +755,7 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi

b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);

int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDx < 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);

@@ -766,10 +766,10 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi

d = (convert_uchar4(dc) == (uchar4)(0))? d : val;

uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );

__global uchar4* D = (__global uchar4 *)(dst + dstStart);

uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -784,7 +784,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig

{
int x = get_global_id(0);
int y = get_global_id(1);

if(x < threadCols && y < dst_rows)
{
x = x << 2;

@@ -801,7 +801,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig

map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;

uchar4 src_data;

src_data.s0 = *(src + srcIdx.s0);

@@ -810,10 +810,10 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig

src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;

__global uchar4* d = (__global uchar4 *)(dst + dstStart);

uchar4 dVal = *d;

int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
@@ -835,7 +835,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);

@@ -854,7 +854,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;

//float8 map1_dataU = map1_dataD + 1;

int4 map1_dataDx = map1_dataD.even;

@@ -888,7 +888,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));

uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;

@@ -901,10 +901,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));

uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );

__global uchar16* D = (__global uchar16 *)(dst + dstStart);

uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
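// Note (editor's gloss): in the 4-channel kernels each scalar condition
// (bc.x etc.) is broadcast to an int4 so all four channels of one pixel
// share a single in/out-of-bounds decision, yielding a 16-lane mask for the
// uchar16 pixel quad; U/V/Ud/Vd replicate the per-pixel bilinear weights
// across channels in the same way.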
@@ -922,7 +922,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);

@@ -944,7 +944,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi

float4 v = temp.odd;
float4 ud = (float4)(1.0) - u;
float4 vd = (float4)(1.0) - v;

//float8 map1_dataU = map1_dataD + 1;

int4 map1_dataDx = map1_dataD.even;

@@ -978,7 +978,7 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi

int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));

uchar16 val = (uchar16)(nval, nval, nval, nval);
a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;

@@ -991,10 +991,10 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi

float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));

uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );

__global uchar16* D = (__global uchar16 *)(dst + dstStart);

uchar16 dVal = *D;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1039,7 +1039,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig

dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);

uchar16 dVal = *d;

int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
@@ -1059,13 +1059,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);

float4 nval = convert_float4(nVal);
float4 val = (float4)(nval.s0);

int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
float8 map1_data;

@@ -1087,7 +1087,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);

@@ -1097,7 +1097,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);

b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);

@@ -1129,7 +1129,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);

int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));

@@ -1140,10 +1140,10 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

d = (convert_float4(dc) == (float4)(0))? d : val;

float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;

__global float4* D = (__global float4 *)((__global char*)dst + dstStart);

float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1160,13 +1160,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const

int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);

float4 nval = convert_float4(nVal);
float4 val = (float4)(nval.s0);

int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
float4 map1_data;

@@ -1191,7 +1191,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const

int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
//not using the vload
int4 src_StartU1 = src_StartU + (int4)(1);
int4 src_StartD1 = src_StartD + (int4)(1);

@@ -1201,7 +1201,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const

a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);

b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);

@@ -1233,7 +1233,7 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const

b = (float4)(aU.y, bU.y, cU.y, dU.y);
c = (float4)(aD.x, bD.x, cD.x, dD.x);
d = (float4)(aD.y, bD.y, cD.y, dD.y);

int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx < (int4)(0) || map1_dataDy < (int4)(0));
int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));

@@ -1244,10 +1244,10 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const

d = (convert_float4(dc) == (float4)(0))? d : val;

float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;

__global float4* D = (__global float4 *)((__global char*)dst + dstStart);

float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1261,9 +1261,9 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *

{
int x = get_global_id(0);
int y = get_global_id(1);

if(x < threadCols && y < dst_rows)
{
x = x << 4;

int gx = x - (dst_offset&15);

@@ -1278,25 +1278,25 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *

short8 map1_data;

map1_data = *((__global short8 *)((__global char*)map1 + map1Start));

int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;

float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;

dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;

__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);

float4 dVal = *d;

int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
@@ -1348,7 +1348,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const

c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;

float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data;

}

@@ -1395,7 +1395,7 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const

c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;

float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data;

}

@@ -1430,8 +1430,8 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im

short8 map1_data;

map1_data = *((__global short8 *)((__global char*)map1 + map1Start));

const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

int4 src_data;

@@ -1448,7 +1448,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im

int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;

*d = dst_data;
}
}
*/
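// Note (editor's gloss): the block closed by the */ above appears to be a
// disabled variant of remapNNSConstant_C1_D0 that reads the source through
// a __read_only image2d_t with a CLAMP_TO_EDGE nearest-filter sampler
// instead of raw buffer indexing; it seems to be kept only for reference.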
@@ -44,14 +44,14 @@

//M*/

// resize kernel
// Currently, CV_8UC1 CV_8UC4 CV_32FC1 and CV_32FC4 are supported.
// We shall support other types later if necessary.

#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define F double
#else
#define F float
#endif

@@ -63,12 +63,12 @@

#define INC(x,l) ((x+1) >= (l) ? (x):((x)+1))

__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);

float4 sx, u, xf;
int4 x, DX;
gx = (gx<<2) - (dstoffset_in_pixel&3);

@@ -80,15 +80,15 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri

float sy = ((dy+0.5f) * ify - 0.5f);
int y = floor(sy);
float v = sy - y;

u = x < 0 ? 0 : u;
u = (x >= src_cols) ? 0 : u;
x = x < 0 ? 0 : x;
x = (x >= src_cols) ? src_cols-1 : x;

y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;

int4 U, U1;
int V, V1;
float4 utmp1, utmp2;

@@ -96,8 +96,8 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri

float4 scale_vec = INTER_RESIZE_COEF_SCALE;
utmp1 = u * scale_vec;
utmp2 = scale_vec - utmp1;
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
vtmp = v * INTER_RESIZE_COEF_SCALE;
V = rint(vtmp);
V1= rint(INTER_RESIZE_COEF_SCALE - vtmp);
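// Note (editor's gloss): the bilinear weights are converted to fixed point
// here -- U/U1 and V/V1 are u, 1-u, v, 1-v scaled by INTER_RESIZE_COEF_SCALE,
// so the interpolation below can run entirely in integer mul24 arithmetic
// and round back to pixels with the (1 << (CAST_BITS-1)) >> CAST_BITS step.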
@@ -137,42 +137,42 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri

val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = mul24((int4)V1 , val1) + mul24((int4)V , val2);

val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);

pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
pos4.w+=3;
uchar4 uval = convert_uchar4_sat(val);
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.w]=uval.w;
}
}
}
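// Note (editor's gloss): when the 4-pixel span lies fully inside dst and the
// destination offset is 4-byte aligned (con above), one uchar4 vector store
// writes all lanes at once; otherwise the kernel falls back to per-pixel
// scalar stores with an individual bounds check per lane, which keeps
// unaligned or edge-clipped ROIs correct at a small cost.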
__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);

@@ -186,10 +186,10 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,

x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;

u = u * INTER_RESIZE_COEF_SCALE;
v = v * INTER_RESIZE_COEF_SCALE;

int U = rint(u);
int V = rint(v);
int U1= rint(INTER_RESIZE_COEF_SCALE - u);

@@ -197,25 +197,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,

int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 data0 = convert_int4(src[srcpos.x]);
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1)
+mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = uval;
}
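// Note (editor's gloss): INC(x,l) clamps the x+1 neighbour to the last
// valid index, so the rightmost column and bottom row replicate their edge
// sample, and mad24 computes y*step + x with 24-bit multiplies, which is
// faster than a full 32-bit mad on most GPUs and safe while buffer offsets
// stay below 2^24 pixels.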
__kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);

@@ -229,16 +229,16 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,

x>=src_cols ? x=src_cols-1,u=0 : x,u;
y<0 ? y=0,v=0 : y,v;
y>=src_rows ? y=src_rows-1,v=0 : y,v;

int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float data0 = src[srcpos.x];
float data1 = src[srcpos.y];
float data2 = src[srcpos.z];

@@ -248,13 +248,13 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,

float val2 = u1 * data2 +
u * data3;
float val = v1 * val1 + v * val2;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
}
__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);

@@ -268,43 +268,43 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,

x>=src_cols ? x=src_cols-1,u=0 : x;
y<0 ? y=0,v=0 : y;
y>=src_rows ? y=src_rows-1,v=0 : y;

int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float4 s_data1, s_data2, s_data3, s_data4;
s_data1 = src[srcpos.x];
s_data2 = src[srcpos.y];
s_data3 = src[srcpos.z];
s_data4 = src[srcpos.w];
float4 val = u1 * v1 * s_data1 + u * v1 * s_data2
+u1 * v *s_data3 + u * v *s_data4;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);

if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dstpos] = val;
}
__kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);

gx = (gx<<2) - (dstoffset_in_pixel&3);
//int4 GX = (int4)(gx, gx+1, gx+2, gx+3);

int4 sx;
int sy;
F ss1 = gx*ifx;
F ss2 = (gx+1)*ifx;
F ss3 = (gx+2)*ifx;
F ss4 = (gx+3)*ifx;
F s5 = dy * ify;

@@ -313,87 +313,87 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,

sx.s2 = min((int)floor(ss3), src_cols-1);
sx.s3 = min((int)floor(ss4), src_cols-1);
sy = min((int)floor(s5), src_rows-1);

uchar4 val;
int4 pos = mad24((int4)sy, (int4)srcstep_in_pixel, sx+(int4)srcoffset_in_pixel);
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];
val.s3 = src[pos.s3];

//__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
//uchar4 dVal = *d;
pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos.y++;
pos.z+=2;
pos.w+=3;

int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0);
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.w]=val.w;
}
}
}
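// Note (editor's gloss): nearest-neighbour resize maps each destination
// column to source column floor(gx * ifx), clamped to src_cols-1 (ifx is
// the src/dst width ratio, so a 2x downscale with ifx = 2 picks every
// second source pixel); the row index sy is derived the same way from ify.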
__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);

if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];

}
__kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);

int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];

}
__kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);

@@ -406,9 +406,9 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,

int sy = min(s_row, src_rows-1);
int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);

if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];

}
@@ -51,7 +51,7 @@

// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };

__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type

@@ -60,15 +60,15 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha

int gx = get_global_id(0);
const int gy = get_global_id(1);

int offset = (dst_offset & 15);
src_offset -= offset;

int dstart = (gx << 4) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
switch (thresh_type)
{
case 0:

@@ -89,20 +89,20 @@ __kernel void threshold_C1_D0(__global const uchar * restrict src, __global ucha

default:
ddata = sdata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
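// Note (editor's gloss): the switch over thresh_type follows the enum
// quoted above -- case 0 THRESH_BINARY (x > thresh ? max_val : 0),
// case 1 THRESH_BINARY_INV, case 2 THRESH_TRUNC (min(x, thresh)),
// case 3 THRESH_TOZERO, case 4 THRESH_TOZERO_INV; any other value copies
// the input through unchanged via the default branch shown in the hunk.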
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type

@@ -110,16 +110,16 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa

{
const int gx = get_global_id(0);
const int gy = get_global_id(1);

int offset = (dst_offset & 3);
src_offset -= offset;

int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
switch (thresh_type)
{
case 0:

@@ -140,14 +140,14 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa

default:
ddata = sdata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
@@ -52,7 +52,7 @@

typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4

@@ -61,9 +61,9 @@ typedef float4 F4;

#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
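// Note (editor's gloss): the affine transform below runs in fixed point --
// mapped coordinates carry AB_BITS (>= 10) fractional bits, of which the
// low INTER_BITS (5) survive as the interpolation fraction, giving
// INTER_TAB_SIZE = 32 sub-pixel positions per axis; the interpolation
// weights themselves are later quantized to INTER_REMAP_COEF_BITS (15) bits.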
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )

/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);

@@ -90,9 +90,9 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u

if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);

int round_delta = (AB_SCALE>>1);

int4 X, Y;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);

@@ -105,13 +105,13 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u

int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);

X += tmp1 + round_delta;
Y += tmp2 + round_delta;

sx = convert_int4(convert_short4(X >> AB_BITS));
sy = convert_int4(convert_short4(Y >> AB_BITS));

__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);

@@ -129,7 +129,7 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u

}
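// Note (editor's gloss): for the affine map (X, Y) = M * (dx, dy, 1) the
// x-dependent terms M[0]*dx and M[3]*dx are evaluated per lane, while the
// y-dependent parts M[1]*dy + M[2] and M[4]*dy + M[5] are computed once per
// row; adding round_delta = AB_SCALE/2 before the >> AB_BITS shift turns
// the truncating shift into round-to-nearest.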
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);

@@ -139,9 +139,9 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob

if( dx < threadCols && dy < dst_rows)
{
dx = (dx<<2) - (dst_offset&3);

int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);

int4 X, Y;
short4 ax, ay;
int4 sx, sy;

@@ -152,22 +152,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob

M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));

int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);

X += tmp1 + round_delta;
Y += tmp2 + round_delta;

X = X >> (AB_BITS - INTER_BITS);
Y = Y >> (AB_BITS - INTER_BITS);

sx = convert_int4(convert_short4(X >> INTER_BITS));
sy = convert_int4(convert_short4(Y >> INTER_BITS));
ax = convert_short4(X & (INTER_TAB_SIZE-1));
ay = convert_short4(Y & (INTER_TAB_SIZE-1));

uchar4 v0, v1, v2,v3;
int4 scon0, scon1, scon2, scon3;
int4 spos0, spos1, spos2, spos3;

@@ -200,12 +200,12 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob

v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;

short4 itab0, itab1, itab2, itab3;
float4 taby, tabx;
taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax);

itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));

@@ -214,30 +214,30 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob

int4 val;
uchar4 tval;
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;

__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
dval = convert_uchar4(dcon != 0) ? tval : dval;
*d = dval;
}
}
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);

int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;

@@ -249,10 +249,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,

short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));

uchar v[16];
int i, j;

#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)

@@ -269,14 +269,14 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,

interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;

#pragma unroll 16
for( i=0; i<16; i++ )
{
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}

if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;

@@ -309,16 +309,16 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst,

***********************************************************************************************/
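// Note (editor's gloss): the bicubic path samples a 4x4 neighbourhood whose
// top-left corner is ((X>>INTER_BITS)-1, (Y>>INTER_BITS)-1), builds
// separable weights tab1x/tab1y via interpolateCubic, and quantizes their
// outer product to shorts; because cubic weights can round unevenly, isum
// is checked against INTER_REMAP_COEF_SCALE and corrected when they drift.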
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

if( dx < threadCols && dy < dst_rows)
{
int round_delta = (AB_SCALE >> 1);

int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;

@@ -326,26 +326,26 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global

int sx0 = (short)(X0 >> AB_BITS);
int sy0 = (short)(Y0 >> AB_BITS);

if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
}
}
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;

src_offset = (src_offset>>2);
srcStep = (srcStep>>2);

int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);

@@ -359,7 +359,7 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo

short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));

int4 v0, v1, v2, v3;

v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;

@@ -371,36 +371,36 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo

float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay0;
tabx = 1.f/INTER_TAB_SIZE*ax0;

itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));

int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;

if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

if( dx < threadCols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);

src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);

int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);

@@ -413,7 +413,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob

int sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));

uchar4 v[16];
int i,j;
#pragma unroll 4

@@ -431,7 +431,7 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob

interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;

#pragma unroll 16
for( i=0; i<16; i++ )
{

@@ -446,17 +446,17 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob

int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;

for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{

if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}

diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
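// Note (editor's gloss): the loop above renormalizes the quantized cubic
// coefficients -- diff is the rounding surplus or deficit versus
// INTER_REMAP_COEF_SCALE, and it is folded into the largest (when diff < 0)
// or smallest (otherwise) coefficient of the central 2x2 block, so the
// weights still sum exactly to 1.0 in fixed point.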
@@ -477,16 +477,16 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob

***********************************************************************************************/

__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M, int threadCols )
{
int dx = get_global_id(0);
int dy = get_global_id(1);

if( dx < threadCols && dy < dst_rows)
{
int round_delta = AB_SCALE/2;

int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;

@@ -494,25 +494,25 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int

short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);

if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
}
}
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
                                     int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;

        src_offset = (src_offset>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -524,7 +524,7 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
        short sy0 = (short)(Y0 >> INTER_BITS);
        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));

        float v0, v1, v2, v3;
        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -538,33 +538,33 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
    }
}
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
                                    int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;

        src_offset = (src_offset>>2);
        dst_offset = (dst_offset>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -576,7 +576,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
        short sy = (short)(Y0 >> INTER_BITS) - 1;
        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
        short ax = (short)(X0 & (INTER_TAB_SIZE-1));

        float v[16];
        int i;

@@ -597,7 +597,7 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float sum = 0;
@@ -617,16 +617,16 @@ __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst,
***********************************************************************************************/
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
                                 int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        int round_delta = AB_SCALE/2;

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -634,28 +634,28 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i

        short sx0 = (short)(X0 >> AB_BITS);
        short sy0 = (short)(Y0 >> AB_BITS);

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx] = (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
    }
}
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
                                     int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;

        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -667,7 +667,7 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
        short sy0 = (short)(Y0 >> INTER_BITS);
        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));

        float4 v0, v1, v2, v3;
        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -681,35 +681,35 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float4 sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[dst_offset+dy*dstStep+dx] = sum;
    }
}
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
                                    int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;

        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        int X0 = rint(M[0] * dx * AB_SCALE);
        int Y0 = rint(M[3] * dx * AB_SCALE);
        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
@@ -721,7 +721,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
        short sy = (short)(Y0 >> INTER_BITS) - 1;
        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
        short ax = (short)(X0 & (INTER_TAB_SIZE-1));

        float4 v[16];
        int i;

@@ -742,7 +742,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float4 sum = 0;
@@ -52,7 +52,7 @@
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
@@ -61,9 +61,9 @@ typedef float4 F4;

#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
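With these values, INTER_TAB_SIZE is 32, so interpolated coordinates carry 5 fractional bits, and AB_SCALE is 1024 for the affine accumulators. A worked example of how a coordinate splits into its integer pixel and fraction index under this layout (illustration only, not part of the commit):

#include <stdio.h>

int main(void)
{
    /* 3.25 pixels stored in INTER_BITS (Q5) fixed point: */
    int X  = (int)(3.25f * 32);   /* == 104 */
    int sx = X >> 5;              /* integer pixel: 104 >> 5 == 3 */
    int ax = X & (32 - 1);        /* fraction index: 104 & 31 == 8, i.e. 8/32 == 0.25 */
    printf("X=%d sx=%d ax=%d\n", X, sx, ax);
    return 0;
}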
@@ -81,7 +81,7 @@ inline void interpolateCubic( float x, float* coeffs )
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
                                      int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
@@ -90,7 +90,7 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
    if( dx < threadCols && dy < dst_rows)
    {
        dx = (dx<<2) - (dst_offset&3);

        F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
        F4 X0 = M[0]*DX + M[1]*dy + M[2];
        F4 Y0 = M[3]*DX + M[4]*dy + M[5];
@@ -118,12 +118,12 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
}
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
                                          int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                          int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -132,12 +132,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        int sx = (short)(X >> INTER_BITS);
        int sy = (short)(Y >> INTER_BITS);
        int ay = (short)(Y & (INTER_TAB_SIZE-1));
        int ax = (short)(X & (INTER_TAB_SIZE-1));

        uchar v[4];
        int i;
        #pragma unroll 4
@@ -150,7 +150,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
        tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
        tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
        tab1x[1] = 1.f/INTER_TAB_SIZE*ax;

        #pragma unroll 4
        for(i=0; i<4; i++)
        {
@@ -170,12 +170,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
}
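Each output pixel (dx, dy) in the perspective kernels is mapped through the 3x3 homography M; W is the homogeneous denominator, and because INTER_TAB_SIZE is folded into the reciprocal, the rounded X and Y land directly in 5-bit fixed point (a zero W is mapped to coordinate 0 rather than dividing by zero). A scalar C sketch of that mapping (illustration only; map_pixel is a hypothetical name, and 32.0 hard-codes INTER_TAB_SIZE):

#include <math.h>

/* Assumes a row-major 3x3 homography M[9]. */
static void map_pixel(const double M[9], int dx, int dy, int *X, int *Y)
{
    double X0 = M[0]*dx + M[1]*dy + M[2];
    double Y0 = M[3]*dx + M[4]*dy + M[5];
    double W  = M[6]*dx + M[7]*dy + M[8];
    double iw = (W != 0.0) ? 32.0 / W : 0.0; /* INTER_TAB_SIZE / W */
    *X = (int)lrint(X0 * iw);                /* source x, 5 fractional bits */
    *Y = (int)lrint(Y0 * iw);                /* source y, 5 fractional bits */
}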
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
                                         int dst_cols, int dst_rows, int srcStep, int dstStep,
                                         int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -184,15 +184,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS) - 1;
        short sy = (short)(Y >> INTER_BITS) - 1;
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        uchar v[16];
        int i, j;

        #pragma unroll 4
        for(i=0; i<4; i++)
            for(j=0; j<4; j++)
@@ -208,7 +208,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
        axx = 1.f/INTER_TAB_SIZE * ax;
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);

        int isum = 0;
        #pragma unroll 16
        for( i=0; i<16; i++ )
@@ -249,12 +249,12 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
                                      int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                      int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
@@ -266,37 +266,37 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
        int Y = rint(Y0*W);
        short sx = (short)X;
        short sy = (short)Y;

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
    }
}
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
                                          int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                          int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);
        srcStep = (srcStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS);
        short sy = (short)(Y >> INTER_BITS);
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        int4 v0, v1, v2, v3;
        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
@@ -308,46 +308,46 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
        float taby, tabx;
        taby = 1.f/INTER_TAB_SIZE*ay;
        tabx = 1.f/INTER_TAB_SIZE*ax;

        itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
        itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
        itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
        itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));

        int4 val;
        val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
    }
}
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
                                         int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                         int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);
        srcStep = (srcStep>>2);
        dst_offset = (dst_offset>>2);
        dstStep = (dstStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS) - 1;
        short sy = (short)(Y >> INTER_BITS) - 1;
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        uchar4 v[16];
        int i,j;
        #pragma unroll 4
@@ -365,7 +365,7 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);
        int isum = 0;

        #pragma unroll 16
        for( i=0; i<16; i++ )
        {
@@ -380,17 +380,17 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
            int k1, k2;
            int diff = isum - INTER_REMAP_COEF_SCALE;
            int Mk1=2, Mk2=2, mk1=2, mk2=2;

            for( k1 = 2; k1 < 4; k1++ )
                for( k2 = 2; k2 < 4; k2++ )
                {
                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
                        mk1 = k1, mk2 = k2;
                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
                        Mk1 = k1, Mk2 = k2;
                }

            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
        }
@@ -411,12 +411,12 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
***********************************************************************************************/

__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
                                      int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -429,33 +429,33 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
        short sy = (short)Y;

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx] = (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
    }
}
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                          int dst_cols, int dst_rows, int srcStep, int dstStep,
                                          int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS);
        short sy = (short)(Y >> INTER_BITS);
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        float v0, v1, v2, v3;
        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
@@ -469,38 +469,38 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
        taby[1] = 1.f/INTER_TAB_SIZE*ay;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
    }
}
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
                                         int dst_cols, int dst_rows, int srcStep, int dstStep,
                                         int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>2);
        dst_offset = (dst_offset>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS) - 1;
        short sy = (short)(Y >> INTER_BITS) - 1;
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
@@ -526,7 +526,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float sum = 0;
@@ -546,12 +546,12 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
                                      int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        F X0 = M[0]*dx + M[1]*dy + M[2];
@@ -562,39 +562,39 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
        int Y = rint(Y0*W);
        short sx = (short)X;
        short sy = (short)Y;

        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx] = (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
    }
}
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
                                          int dst_cols, int dst_rows, int srcStep, int dstStep,
                                          int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows)
    {
        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx0 = (short)(X >> INTER_BITS);
        short sy0 = (short)(Y >> INTER_BITS);
        short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
        short ax0 = (short)(X & (INTER_TAB_SIZE-1));

        float4 v0, v1, v2, v3;
        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
@@ -608,46 +608,46 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
        tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;

        tab[0] = taby[0] * tabx[0];
        tab[1] = taby[0] * tabx[1];
        tab[2] = taby[1] * tabx[0];
        tab[3] = taby[1] * tabx[1];

        float4 sum = 0;
        sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
            dst[dst_offset+dy*dstStep+dx] = sum;
    }
}
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
                                         int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
                                         int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if( dx < threadCols && dy < dst_rows )
    {
        src_offset = (src_offset>>4);
        dst_offset = (dst_offset>>4);
        srcStep = (srcStep>>2);
        dstStep = (dstStep>>2);

        F X0 = M[0]*dx + M[1]*dy + M[2];
        F Y0 = M[3]*dx + M[4]*dy + M[5];
        F W = M[6]*dx + M[7]*dy + M[8];
        W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
        int X = rint(X0*W);
        int Y = rint(Y0*W);

        short sx = (short)(X >> INTER_BITS)-1;
        short sy = (short)(Y >> INTER_BITS)-1;
        short ay = (short)(Y & (INTER_TAB_SIZE-1));
        short ax = (short)(X & (INTER_TAB_SIZE-1));

        float4 v[16];
        int i;

@@ -668,7 +668,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
        {
            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
        }

        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
        {
            float4 sum = 0;
@@ -1,252 +1,252 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

// atomic add for 32bit floating point
inline void atomic_addf(volatile __global float *source, const float operand) {
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;
    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
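OpenCL 1.x provides no native atomic add on float, so atomic_addf above emulates one: it reinterprets the 32-bit float as an unsigned int through a union, computes the new sum, and retries the compare-and-swap until no other work-item has changed the cell in between. A usage sketch (illustration only; accumulate_demo is a hypothetical kernel, but it calls atomic_addf exactly the way forwardWarpKernel does below):

__kernel void accumulate_demo(volatile __global float * acc,
                              __global const float * vals, int n)
{
    int i = get_global_id(0);
    if (i < n)
        atomic_addf(acc, vals[i]);   // all work-items add into acc[0] without lost updates
}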
__kernel void memsetKernel(
    float val,
    __global float * image,
    int width,
    int height,
    int step, // in element
    int offset
    )
{
    if(get_global_id(0) >= width || get_global_id(1) >= height)
    {
        return;
    }
    image += offset;
    image[get_global_id(0) + get_global_id(1) * step] = val;
}

__kernel void normalizeKernel(
    __global float * buffer,
    int width,
    int height,
    int step,
    int f_offset,
    int d_offset
    )
{
    __global float * factors = buffer + f_offset;
    __global float * dst = buffer + d_offset;

    int j = get_global_id(0);
    int i = get_global_id(1);

    if(j >= width || i >= height)
    {
        return;
    }
    float scale = factors[step * i + j];
    float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);

    dst[step * i + j] *= invScale;
}
__kernel void forwardWarpKernel(
    __global const float * src,
    __global float * buffer,
    __global const float * u,
    __global const float * v,
    const int w,
    const int h,
    const int flow_stride,
    const int image_stride,
    const int factor_offset,
    const int dst_offset,
    const float time_scale
    )
{
    int j = get_global_id(0);
    int i = get_global_id(1);

    if (i >= h || j >= w) return;

    volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
    volatile __global float * dst = (volatile __global float *)buffer + dst_offset;

    int flow_row_offset = i * flow_stride;
    int image_row_offset = i * image_stride;

    // bottom left corner of a target pixel
    float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
    float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
    // pixel containing bottom left corner
    float px;
    float py;
    float dx = modf(cx, &px);
    float dy = modf(cy, &py);
    // target pixel integer coords
    int tx;
    int ty;
    tx = (int) px;
    ty = (int) py;
    float value = src[image_row_offset + j];
    float weight;
    // fill pixel containing bottom right corner
    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
    {
        weight = dx * dy;
        atomic_addf(dst + ty * image_stride + tx, value * weight);
        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
    }

    // fill pixel containing bottom left corner
    tx -= 1;
    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
    {
        weight = (1.0f - dx) * dy;
        atomic_addf(dst + ty * image_stride + tx, value * weight);
        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
    }

    // fill pixel containing upper left corner
    ty -= 1;
    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
    {
        weight = (1.0f - dx) * (1.0f - dy);
        atomic_addf(dst + ty * image_stride + tx, value * weight);
        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
    }

    // fill pixel containing upper right corner
    tx += 1;
    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
    {
        weight = dx * (1.0f - dy);
        atomic_addf(dst + ty * image_stride + tx, value * weight);
        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
    }
}
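forwardWarpKernel splats each source pixel forward along its flow vector: the landing point generally falls between pixel centers, so value * weight is distributed bilinearly to the four surrounding pixels while the bare weights are accumulated in a separate plane; normalizeKernel above then divides by that accumulation so overlapping splats average out. The four weights always sum to one, as this small C check shows (illustration only, not part of the commit):

#include <assert.h>

/* The bilinear splat weights used by forwardWarpKernel for a landing
   point with fractional offsets (dx, dy) inside the containing pixel. */
static float splat_weight_sum(float dx, float dy)
{
    return dx * dy                     /* bottom right */
         + (1.f - dx) * dy             /* bottom left  */
         + (1.f - dx) * (1.f - dy)     /* upper left   */
         + dx * (1.f - dy);            /* upper right  */
}

int main(void)
{
    assert(splat_weight_sum(0.25f, 0.75f) == 1.f);  /* dy + (1 - dy) == 1 */
    return 0;
}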
// define buffer offsets
enum
{
    O0_OS = 0,
    O1_OS,
    U_OS,
    V_OS,
    UR_OS,
    VR_OS
};

__kernel void blendFramesKernel(
    image2d_t tex_src0,
    image2d_t tex_src1,
    __global float * buffer,
    __global float * out,
    int w,
    int h,
    int step,
    float theta
    )
{
    __global float * u  = buffer + h * step * U_OS;
    __global float * v  = buffer + h * step * V_OS;
    __global float * ur = buffer + h * step * UR_OS;
    __global float * vr = buffer + h * step * VR_OS;
    __global float * o0 = buffer + h * step * O0_OS;
    __global float * o1 = buffer + h * step * O1_OS;

    int ix = get_global_id(0);
    int iy = get_global_id(1);

    if(ix >= w || iy >= h) return;

    int pos = ix + step * iy;

    float _u = u[pos];
    float _v = v[pos];

    float _ur = ur[pos];
    float _vr = vr[pos];

    float x = (float)ix + 0.5f;
    float y = (float)iy + 0.5f;
    bool b0 = o0[pos] > 1e-4f;
    bool b1 = o1[pos] > 1e-4f;

    float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
    float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));

    if (b0 && b1)
    {
        // pixel is visible on both frames
        out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
                   read_imagef(tex_src1, sampler, coord1).x * theta;
    }
    else if (b0)
    {
        // visible on the first frame only
        out[pos] = read_imagef(tex_src0, sampler, coord0).x;
    }
    else
    {
        // visible on the second frame only
        out[pos] = read_imagef(tex_src1, sampler, coord1).x;
    }
}
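blendFramesKernel synthesizes the in-between frame at time theta in (0, 1): it samples frame 0 backward along the flow by theta and frame 1 forward by (1 - theta) through the CLAMP_TO_EDGE linear sampler, cross-fading the two reads where both occlusion masks (o0, o1) mark the pixel visible. A 1-D C sketch of that cross-fade (illustration only; nearest sampling and no bounds clamping, unlike the real sampler):

float blend1d(const float *f0, const float *f1, int x, float u, float theta)
{
    int x0 = (int)(x - u * theta + 0.5f);          /* frame 0, walked backward */
    int x1 = (int)(x + u * (1.f - theta) + 0.5f);  /* frame 1, walked forward  */
    return f0[x0] * (1.f - theta) + f1[x1] * theta;
}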
File diff suppressed because it is too large
@@ -50,8 +50,8 @@ typedef double F;
typedef float F;
#endif

short2 do_mean_shift(int x0, int y0, __global uchar4* out, int out_step,
                     __global uchar4* in, int in_step, int dst_off, int src_off,
                     int cols, int rows, int sp, int sr, int maxIter, float eps)
{
    int isr2 = sr*sr;
@@ -81,9 +81,9 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
    for( int y = miny; y <= maxy; y++)
    {
        int rowCount = 0;
        int x = minx;
        for( ; x+3 <= maxx; x+=4 )
        {
            int id = src_off + y*in_step + x;
            uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
            int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
@@ -126,7 +126,7 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
                s.x += t.s0; s.y += t.s1; s.z += t.s2;
                sx += x; rowCount++;
            }

        }
        if(x+1 == maxx)
        {
@@ -213,32 +213,32 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
}

__kernel void meanshift_kernel(__global uchar4* out, int out_step,
                               __global uchar4* in, int in_step,
                               int dst_off, int src_off, int cols, int rows,
                               int sp, int sr, int maxIter, float eps)
{
    int x0 = get_global_id(0);
    int y0 = get_global_id(1);
    if( x0 < cols && y0 < rows )
        do_mean_shift(x0, y0, out, out_step, in, in_step, dst_off, src_off,
                      cols, rows, sp, sr, maxIter, eps);
}

__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
                                    __global short2* outsp, int instep, int outrstep,
                                    int outspstep, int in_off, int outr_off, int outsp_off,
                                    int cols, int rows, int sp, int sr, int maxIter, float eps )
{
    int x0 = get_global_id(0);
    int y0 = get_global_id(1);

    if( x0 < cols && y0 < rows )
    {
        //int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
        //*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
        // we have ensured before that ((outspstep & 0x3) == 0), so the byte step is addressable in short2 units.
        outsp_off >>= 2;
        outspstep >>= 2;
        int basesp = outsp_off + y0 * outspstep + x0;
        outsp[basesp] = do_mean_shift(x0, y0, outr, outrstep, in, instep, outr_off, in_off, cols, rows, sp, sr, maxIter, eps);
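The >> 2 conversions exist because outspstep and outsp_off arrive from the host in bytes while outsp is indexed in short2 elements (sizeof(short2) == 4); the host guarantees 4-byte alignment, so the shifts are exact. A small C sketch of the same index computation (illustration only; short2_index is a hypothetical name):

/* Byte-addressed step/offset converted to short2 element units,
   mirroring the shifts in meanshiftproc_kernel. */
static int short2_index(int byte_off, int byte_step, int x0, int y0)
{
    int off  = byte_off  >> 2;   /* bytes -> short2 elements */
    int step = byte_step >> 2;
    return off + y0 * step + x0; /* corresponds to basesp above */
}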
@@ -59,25 +59,25 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        x = x << 1;

        #define dst_align ((dst_offset & 3) >> 1)
        int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
        int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        __global uchar4 * dst = (__global uchar4 *)(mat_dst + dst_index);
        __global uchar * src0 = mat_src0 + src0_index;
        __global uchar * src1 = src0 + 1;
        __global uchar * src2 = mat_src1 + src1_index;
        __global uchar * src3 = src2 + 1;

        uchar4 dst_data = *dst;
        uchar data_0 = *(src0);
@@ -87,8 +87,8 @@ __kernel void merge_vector_C2_D0(__global uchar *mat_dst, int dst_step, int ds

        uchar4 tmp_data = (uchar4)(data_0, data_2, data_1, data_3);

        tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
        tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;

        *dst = tmp_data;
    }
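merge_vector_C2_D0 interleaves two single-channel 8-bit images into one 2-channel image, two pixels (one uchar4) per work-item; dst_align and the dst_start/dst_end clamps keep the 4-byte vector store from spilling past the row when the destination offset is not 4-byte aligned. The interleave itself is just this (a C sketch, illustration only; uchar4_t stands in for OpenCL's uchar4):

/* Pack two pixels from each single-channel source into one 2-channel
   uchar4, as the vector store above does. */
typedef struct { unsigned char x, y, z, w; } uchar4_t;

static uchar4_t interleave2(const unsigned char *src0, const unsigned char *src1)
{
    uchar4_t r = { src0[0], src1[0], src0[1], src1[1] };
    return r;
}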
@@ -100,25 +100,25 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        x = x << 1;

        #define dst_align ((dst_offset & 3) >> 1)
        int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
        int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        __global char4 * dst = (__global char4 *)(mat_dst + dst_index);
        __global char * src0 = mat_src0 + src0_index;
        __global char * src1 = src0 + 1;
        __global char * src2 = mat_src1 + src1_index;
        __global char * src3 = src2 + 1;

        char4 dst_data = *dst;
        char data_0 = *(src0);
@@ -128,8 +128,8 @@ __kernel void merge_vector_C2_D1(__global char *mat_dst, int dst_step, int dst

        char4 tmp_data = (char4)(data_0, data_2, data_1, data_3);

        tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
        tmp_data.zw = dst_index + 2 < dst_end ? tmp_data.zw : dst_data.zw;

        *dst = tmp_data;
    }
@@ -141,12 +141,12 @@ __kernel void merge_vector_C2_D2(__global ushort *mat_dst, int dst_step, int d

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);

        int dst_index = mad24(y, dst_step , dst_offset);
@@ -167,12 +167,12 @@ __kernel void merge_vector_C2_D3(__global short *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);

        int dst_index = mad24(y, dst_step , dst_offset);
@@ -193,12 +193,12 @@ __kernel void merge_vector_C2_D4(__global int *mat_dst, int dst_step, int dst_
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -213,12 +213,12 @@ __kernel void merge_vector_C2_D5(__global float *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -235,12 +235,12 @@ __kernel void merge_vector_C2_D6(__global double *mat_dst, int dst_step, int d
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -258,8 +258,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        x = x << 2;
@@ -268,8 +268,8 @@ __kernel void merge_vector_C3_D0(__global uchar *mat_dst, int dst_step, int ds
        int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
        int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);

        uchar data0_0 = *(mat_src0 + src0_index + 0);
@@ -322,8 +322,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
@@ -332,8 +332,8 @@ __kernel void merge_vector_C3_D1(__global char *mat_dst, int dst_step, int dst
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
|
||||
|
||||
char data0_0 = *(mat_src0 + src0_index + 0);
|
||||
@@ -386,8 +386,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
@@ -396,8 +396,8 @@ __kernel void merge_vector_C3_D2(__global ushort *mat_dst, int dst_step, int d
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
|
||||
|
||||
ushort data0_0 = *((__global ushort *)((__global char *)mat_src0 + src0_index + 0));
|
||||
@@ -438,8 +438,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 1;
|
||||
@@ -448,8 +448,8 @@ __kernel void merge_vector_C3_D3(__global short *mat_dst, int dst_step, int ds
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
|
||||
|
||||
short data0_0 = *((__global short *)((__global char *)mat_src0 + src0_index + 0));
|
||||
@@ -490,13 +490,13 @@ __kernel void merge_vector_C3_D4(__global int *mat_dst, int dst_step, int dst_
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
|
||||
int dst_index = mad24(y, dst_step , dst_offset);
|
||||
|
||||
@@ -524,13 +524,13 @@ __kernel void merge_vector_C3_D5(__global float *mat_dst, int dst_step, int ds
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
|
||||
int dst_index = mad24(y, dst_step , dst_offset);
|
||||
|
||||
@@ -560,13 +560,13 @@ __kernel void merge_vector_C3_D6(__global double *mat_dst, int dst_step, int d
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
int y = get_global_id(1);
|
||||
|
||||
if ((x < cols) && (y < rows))
|
||||
{
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
int src0_index = mad24(y, src0_step, src0_offset);
|
||||
int src1_index = mad24(y, src1_step, src1_offset);
|
||||
int src2_index = mad24(y, src2_step, src2_offset);
|
||||
|
||||
int dst_index = mad24(y, dst_step , dst_offset);
|
||||
|
||||
@@ -596,14 +596,14 @@ __kernel void merge_vector_C4_D0(__global uchar *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        uchar src0 = *(mat_src0 + src0_index + x );
@@ -622,14 +622,14 @@ __kernel void merge_vector_C4_D1(__global char *mat_dst, int dst_step, int dst
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        char src0 = *(mat_src0 + src0_index + x );
@@ -648,14 +648,14 @@ __kernel void merge_vector_C4_D2(__global ushort *mat_dst, int dst_step, int d
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        ushort src0 = *((__global ushort *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -674,14 +674,14 @@ __kernel void merge_vector_C4_D3(__global short *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        short src0 = *((__global short *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
@@ -700,14 +700,14 @@ __kernel void merge_vector_C4_D4(__global int *mat_dst, int dst_step, int dst_
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -726,14 +726,14 @@ __kernel void merge_vector_C4_D5(__global float *mat_dst, int dst_step, int ds
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
@@ -754,14 +754,14 @@ __kernel void merge_vector_C4_D6(__global double *mat_dst, int dst_step, int d
                                 int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        int src0_index = mad24(y, src0_step, src0_offset);
        int src1_index = mad24(y, src1_step, src1_offset);
        int src2_index = mad24(y, src2_step, src2_offset);
        int src3_index = mad24(y, src3_step, src3_offset);
        int dst_index = mad24(y, dst_step , dst_offset);

        double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
@@ -783,8 +783,8 @@ __kernel void merge_vector_C2_D0_1(int rows, int cols,
                                   __global uchar *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -807,8 +807,8 @@ __kernel void merge_vector_C2_D1_1(int rows, int cols,
                                   __global char *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -831,8 +831,8 @@ __kernel void merge_vector_C2_D2_1(int rows, int cols,
                                   __global ushort *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global ushort2 *src0_y = (__global ushort2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -855,8 +855,8 @@ __kernel void merge_vector_C2_D3_1(int rows, int cols,
                                   __global short *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global short2 *src0_y = (__global short2 *)((__global uchar *)mat_src0 + y * src0_step);
@@ -880,8 +880,8 @@ __kernel void merge_vector_C2_D4_1(int rows, int cols,
                                   __global int *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global int *src0_y = (__global int *)((__global uchar *)mat_src0 + y * src0_step);
@@ -904,8 +904,8 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
                                   __global float *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global float *src0_y = (__global float *)((__global uchar *)mat_src0 + y * src0_step);
@@ -915,7 +915,7 @@ __kernel void merge_vector_C2_D5_1(int rows, int cols,
        float value1 = src0_y[x];
        float value2 = src1_y[x];

        dst_y[x] = (float2)(value1, value2);
    }
}

@@ -926,8 +926,8 @@ __kernel void merge_vector_C2_D6_1(int rows, int cols,
                                   __global double *mat_src1, int src1_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global double *src0_y = (__global double *)((__global uchar *)mat_src0 + y * src0_step);
@@ -949,8 +949,8 @@ __kernel void merge_vector_C3_D0_1(int rows, int cols,
                                   __global uchar *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -981,8 +981,8 @@ __kernel void merge_vector_C3_D1_1(int rows, int cols,
                                   __global char *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1027,8 +1027,8 @@ __kernel void merge_vector_C3_D2_1(int rows, int cols,
                                   __global ushort *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global ushort2 *src0_y = (__global ushort2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1054,8 +1054,8 @@ __kernel void merge_vector_C3_D3_1(int rows, int cols,
                                   __global short *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global short2 *src0_y = (__global short2 * )((__global char *)mat_src0 + y * src0_step);
@@ -1091,8 +1091,8 @@ __kernel void merge_vector_C3_D4_1(int rows, int cols,
                                   __global int *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global int *src0_y = (__global int * )((__global char *)mat_src0 + y * src0_step);
@@ -1123,8 +1123,8 @@ __kernel void merge_vector_C3_D5_1(int rows, int cols,
                                   __global float *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global float *src0_y = (__global float * )((__global char *)mat_src0 + y * src0_step);
@@ -1151,8 +1151,8 @@ __kernel void merge_vector_C3_D6_1(int rows, int cols,
                                   __global double *mat_src2, int src2_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global double *src0_y = (__global double * )((__global char *)mat_src0 + y * src0_step);
@@ -1179,8 +1179,8 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
                                   __global uchar *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global uchar4 *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
@@ -1196,7 +1196,7 @@ __kernel void merge_vector_C4_D0_1(int rows, int cols,
        uchar4 value3 = src3_y[x];

        dst_y[x] = (uchar16)(value0.x, value1.x, value2.x, value3.x,
                             value0.y, value1.y, value2.y, value3.y,
                             value0.z, value1.z, value2.z, value3.z,
                             value0.w, value1.w, value2.w, value3.w);
    }
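The *_C4_*_1 kernels merge four single-channel rows by interleaving: element i of each input contributes one lane, so four 4-lane loads become a single 16-lane store, as in the (uchar16)(...) constructor above. A hedged plain-C restatement of that shuffle (array indices standing in for vector lanes; the names are illustrative):

#include <stdio.h>

/* Interleave element e of planes 0..3 into dst[e*4 + p], mirroring
 * (uchar16)(v0.x, v1.x, v2.x, v3.x, v0.y, v1.y, v2.y, v3.y, ...). */
static void interleave4x4(const unsigned char s[4][4], unsigned char dst[16])
{
    for (int e = 0; e < 4; ++e)          /* lanes .x .y .z .w */
        for (int p = 0; p < 4; ++p)      /* planes src0..src3 */
            dst[e * 4 + p] = s[p][e];
}

int main(void)
{
    unsigned char planes[4][4] = {{0,1,2,3},{10,11,12,13},{20,21,22,23},{30,31,32,33}};
    unsigned char out[16];
    interleave4x4(planes, out);
    for (int i = 0; i < 16; ++i)
        printf("%d ", out[i]);           /* 0 10 20 30 1 11 21 31 ... */
    printf("\n");
    return 0;
}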
@@ -1210,8 +1210,8 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
                                   __global char *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global char4 *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
@@ -1227,7 +1227,7 @@ __kernel void merge_vector_C4_D1_1(int rows, int cols,
        char4 value3 = src3_y[x];

        dst_y[x] = (char16)(value0.x, value1.x, value2.x, value3.x,
                            value0.y, value1.y, value2.y, value3.y,
                            value0.z, value1.z, value2.z, value3.z,
                            value0.w, value1.w, value2.w, value3.w);
    }
@@ -1240,8 +1240,8 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
                                   __global ushort *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global ushort2 *src0_y = (__global ushort2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1257,7 +1257,7 @@ __kernel void merge_vector_C4_D2_1(int rows, int cols,
        ushort2 value3 = src3_y[x];

        dst_y[x] = (ushort8)(value0.x, value1.x, value2.x, value3.x,
                             value0.y, value1.y, value2.y, value3.y);
    }
}
__kernel void merge_vector_C4_D3_1(int rows, int cols,
@@ -1268,8 +1268,8 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
                                   __global short *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global short2 *src0_y = (__global short2 * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1285,7 +1285,7 @@ __kernel void merge_vector_C4_D3_1(int rows, int cols,
        short2 value3 = src3_y[x];

        dst_y[x] = (short8)(value0.x, value1.x, value2.x, value3.x,
                            value0.y, value1.y, value2.y, value3.y);
    }
}
__kernel void merge_vector_C4_D4_1(int rows, int cols,
@@ -1296,8 +1296,8 @@ __kernel void merge_vector_C4_D4_1(int rows, int cols,
                                   __global int *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global int *src0_y = (__global int * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1323,8 +1323,8 @@ __kernel void merge_vector_C4_D5_1(int rows, int cols,
                                   __global float *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global float *src0_y = (__global float * )((__global uchar*)mat_src0 + y * src0_step);
@@ -1352,8 +1352,8 @@ __kernel void merge_vector_C4_D6_1(int rows, int cols,
                                   __global double *mat_src3, int src3_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if ((x < cols) && (y < rows))
    {
        __global double *src0_y = (__global double * )((__global uchar*)mat_src0 + y * src0_step);

@@ -210,7 +210,7 @@ __kernel void icvCalcLayerDetAndTrace(
        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);

        det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
    }
}

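For context on the 0.81f constant above: it is the standard SURF weighting of the box-filter Hessian, where the mixed second derivative is scaled by 0.9 before squaring, i.e.

\[ \det(\mathcal{H}_{\mathrm{approx}}) = D_{xx}\,D_{yy} - (0.9\,D_{xy})^2 = D_{xx}\,D_{yy} - 0.81\,D_{xy}^2 \]

so dx * dy - 0.81f * dxy * dxy is exactly this approximation with dx, dy, dxy as the Haar-pattern responses.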
@@ -246,9 +246,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
// Non-maximal suppression to further filter the candidates from the previous step
__kernel
    void icvFindMaximaInLayer_withmask(
    __global const float * det,
    __global const float * trace,
    __global int4 * maxPosBuffer,
    volatile __global unsigned int* maxCounter,
    int counter_offset,
    int det_step,     // the step of det in bytes
@@ -288,26 +288,26 @@ __kernel
    // Is this thread within the hessian buffer?
    const int zoff = get_local_size(0) * get_local_size(1);
    const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
    N9[localLin - zoff] =
        det[det_step *
        (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
        + min(max(j, 0), c_img_cols - 1)];                            // x
    N9[localLin       ] =
        det[det_step *
        (c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1)) // y
        + min(max(j, 0), c_img_cols - 1)];                            // x
    N9[localLin + zoff] =
        det[det_step *
        (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
        + min(max(j, 0), c_img_cols - 1)];                            // x

    barrier(CLK_LOCAL_MEM_FENCE);

    if (i < c_layer_rows - margin
        && j < c_layer_cols - margin
        && get_local_id(0) > 0
        && get_local_id(0) < get_local_size(0) - 1
        && get_local_id(1) > 0
        && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
        )
    {
@@ -372,9 +372,9 @@ __kernel

__kernel
    void icvFindMaximaInLayer(
    __global float * det,
    __global float * trace,
    __global int4 * maxPosBuffer,
    volatile __global unsigned int* maxCounter,
    int counter_offset,
    int det_step,     // the step of det in bytes
@@ -417,19 +417,19 @@ __kernel
    int l_x = min(max(j, 0), c_img_cols - 1);
    int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);

    N9[localLin - zoff] =
        det[det_step * (l_y - c_layer_rows) + l_x];
    N9[localLin       ] =
        det[det_step * (l_y               ) + l_x];
    N9[localLin + zoff] =
        det[det_step * (l_y + c_layer_rows) + l_x];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (i < c_layer_rows - margin
        && j < c_layer_cols - margin
        && get_local_id(0) > 0
        && get_local_id(0) < get_local_size(0) - 1
        && get_local_id(1) > 0
        && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
        )
    {
@@ -497,17 +497,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
    {
        F invdet = 1.0 / det;

        x[0] = invdet *
            (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
             A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
             A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   ));

        x[1] = invdet *
            (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
             b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
             A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0]));

        x[2] = invdet *
            (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
             A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
             b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
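solve3x3_float solves the 3x3 linear system A x = b by Cramer-style cofactor expansion, dividing each cofactor sum by the precomputed determinant. The same algebra as a self-contained C function (no __local qualifiers; the caller is assumed to have already rejected det == 0):

/* Cramer's-rule solve of A x = b for a 3x3 system; det must be non-zero.
 * Mirrors the cofactor expansion used in solve3x3_float above. */
static void solve3x3(const float A[3][3], const float b[3], float det, float x[3])
{
    float invdet = 1.0f / det;
    x[0] = invdet * (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
                     A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
                     A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   ));
    x[1] = invdet * (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
                     b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
                     A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0]));
    x[2] = invdet * (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
                     A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
                     b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
}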
@@ -528,9 +528,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc

////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
    void icvInterpolateKeypoint(
    __global const float * det,
    __global const int4 * maxPosBuffer,
    __global float * keypoints,
    volatile __global unsigned int * featureCounter,
@@ -560,7 +560,7 @@ __kernel

    volatile __local float N9[3][3][3];

    N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
        det[det_step * (c_layer_rows * layer + i) + j];
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -658,27 +658,27 @@ __kernel

__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
    0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
    0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
    0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
    0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
    0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
    0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
    0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
    0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
    0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
    0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
    0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
    0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
    0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
    0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
    0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
    0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
    0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
    0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
    0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
    0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
    0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
    0.001707611023448408f, 0.001455130288377404f};

@@ -691,13 +691,13 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
    data[tid] = partial_reduction;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
    {
        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
        data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
    }
#undef op
}
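reduce_32_sum folds 32 partial sums down to data[0] with strides 16, 8, 4, 2, 1; only the first 16 work-items participate and no barrier separates the steps, so the code relies on those work-items executing in lock-step. A sequential C model that reproduces the value left in data[0] (a sketch of the folding order only, not of the concurrency):

/* Sequential model of the 32 -> 1 strided sum reduction: each pass
 * folds data[i + stride] into data[i]; the total ends up in data[0]. */
static float tree_reduce32(float data[32])
{
    for (int stride = 16; stride >= 1; stride /= 2)
        for (int i = 0; i < stride; ++i)
            data[i] += data[i + stride];
    return data[0];
}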
@@ -758,7 +758,7 @@ __kernel
    Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);

    angle = atan2(Y, X);

    if (angle < 0)
        angle += 2.0f * CV_PI_F;
    angle *= 180.0f / CV_PI_F;
@@ -769,7 +769,7 @@ __kernel
    s_Y[tid] = Y;
    s_angle[tid] = angle;
    barrier(CLK_LOCAL_MEM_FENCE);

    float bestx = 0, besty = 0, best_mod = 0;

#pragma unroll
@@ -881,8 +881,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =

// utility for linear filter
inline uchar readerGet(
    image2d_t src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
    int i, int j
    )
{
@@ -892,8 +892,8 @@ inline uchar readerGet(
}

inline float linearFilter(
    image2d_t src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
    float y, float x
    )
{
@@ -927,9 +927,9 @@ void calc_dx_dy(
    volatile __local float s_dx_bin[25],
    volatile __local float s_dy_bin[25],
    volatile __local float s_PATCH[6][6],
    __global const float* featureX,
    __global const float* featureY,
    __global const float* featureSize,
    __global const float* featureDir
    )
{
@@ -976,26 +976,26 @@ void calc_dx_dy(
        const float dw = c_DW[yIndex * PATCH_SZ + xIndex];

        const float vx = (
            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1] -
            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] +
            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ])
            * dw;
        const float vy = (
            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ] -
            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] +
            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1])
            * dw;
        s_dx_bin[tid] = vx;
        s_dy_bin[tid] = vy;
    }
}
void reduce_sum25(
    volatile __local float* sdata1,
    volatile __local float* sdata2,
    volatile __local float* sdata3,
    volatile __local float* sdata4,
    int tid
    )
{
@@ -1033,10 +1033,10 @@ void reduce_sum25(
    }
}

__kernel
    void compute_descriptors64(
    image2d_t imgTex,
    volatile __global float * descriptors,
    __global const float * keypoints,
    int descriptors_step,
    int keypoints_step
@@ -1083,10 +1083,10 @@ __kernel
        }
    }
}
__kernel
    void compute_descriptors128(
    image2d_t imgTex,
    __global volatile float * descriptors,
    __global float * keypoints,
    int descriptors_step,
    int keypoints_step
@@ -1178,7 +1178,7 @@ __kernel
    }
}

__kernel
    void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
    descriptors_step /= sizeof(*descriptors);
@@ -1219,7 +1219,7 @@ __kernel
    // normalize and store in output
    descriptor_base[get_local_id(0)] = lookup / len;
}
__kernel
    void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
    descriptors_step /= sizeof(*descriptors);

@@ -54,10 +54,10 @@
//----------------------------------------------------------------------------
// Histogram computation

__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y,
                                   const int cnbins, const int cblock_hist_size, const int img_block_width,
                                   const int grad_quadstep, const int qangle_step,
                                   __global const float* grad, __global const uchar* qangle,
                                   const float scale, __global float* block_hists, __local float* smem)
{
    const int lidX = get_local_id(0);
@@ -213,10 +213,10 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
    products[tid] = product;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128) products[tid] = product = product + products[tid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64) products[tid] = product = product + products[tid + 64];
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -240,12 +240,12 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr

__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
                                            const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
                                            __global const float* block_hists, __global float* descriptors)
{
    int tid = get_local_id(0);
    int gidX = get_group_id(0);
    int gidY = get_group_id(1);

    // Get left top corner of the window in src
    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;

@@ -261,7 +261,7 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
    }
}

__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
                                            const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
                                            const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
{
@@ -291,8 +291,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
//----------------------------------------------------------------------------
// Gradients computation

__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
                                            const __global uchar4 * img, __global float * grad, __global uchar * qangle,
                                            const float angle_scale, const char correct_gamma, const int cnbins)
{
    const int x = get_global_id(0);
@@ -391,7 +391,7 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}

__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
                                            __global const uchar * img, __global float * grad, __global uchar * qangle,
                                            const float angle_scale, const char correct_gamma, const int cnbins)
{
    const int x = get_global_id(0);
@@ -453,37 +453,37 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
// Resize

__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
                                 int dst_offset, int src_offset, int dst_step, int src_step,
                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    int sx = (int)floor(dx*ifx+0.5f);
    int sy = (int)floor(dy*ify+0.5f);
    sx = min(sx, src_cols-1);
    sy = min(sy, src_rows-1);
    int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
    int spos = (src_offset>>2) + sy * (src_step>>2) + sx;

    if(dx<dst_cols && dy<dst_rows)
        dst[dpos] = src[spos];
}

__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
                                 int dst_offset, int src_offset, int dst_step, int src_step,
                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    int sx = (int)floor(dx*ifx+0.5f);
    int sy = (int)floor(dy*ify+0.5f);
    sx = min(sx, src_cols-1);
    sy = min(sy, src_rows-1);
    int dpos = dst_offset + dy * dst_step + dx;
    int spos = src_offset + sy * src_step + sx;

    if(dx<dst_cols && dy<dst_rows)
        dst[dpos] = src[spos];
}
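Both resize kernels use the same nearest-neighbour mapping: destination pixel (dx, dy) samples the source at round(dx*ifx), round(dy*ify), where ifx and ify are the inverse scale factors (src_size / dst_size), clamped to the last row/column. The coordinate mapping alone, as a small C helper:

#include <math.h>

/* Nearest-neighbour source coordinate for destination index d, given the
 * inverse scale factor inv_scale = src_size / (float)dst_size; clamped so
 * the last destination pixels still read inside the source image. */
static int nn_src_coord(int d, float inv_scale, int src_size)
{
    int s = (int)floorf(d * inv_scale + 0.5f);
    return (s < src_size - 1) ? s : src_size - 1;
}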
@@ -37,348 +37,348 @@
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
    __global const int* restrict srcMat,
    __global uchar* dstMat,
    int cols,
    int rows,
    int srcStep_in_pixel,
    int srcoffset_in_pixel,
    int dstStep_in_pixel,
    int dstoffset_in_pixel,
    F alpha,
    F beta)
{
    int x=get_global_id(0)<<2;
    int y=get_global_id(1);
    //int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
    //int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
    int off_src = (dstoffset_in_pixel & 3);
    int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
    int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
    int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
    if(x+3<cols && y<rows && off_src==0)
    {
        float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
        *(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
    }
    else
    {
        if(x+3<cols && y<rows)
        {
            float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
            uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
            dstMat[dstidx] = temp_dst.x;
            dstMat[dstidx+1] = temp_dst.y;
            dstMat[dstidx+2] = temp_dst.z;
            dstMat[dstidx+3] = temp_dst.w;
        }
        else if(x+2<cols && y<rows)
        {
            float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
            uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
            dstMat[dstidx] = temp_dst.x;
            dstMat[dstidx+1] = temp_dst.y;
            dstMat[dstidx+2] = temp_dst.z;
        }
        else if(x+1<cols && y<rows)
        {
            float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
            uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
            dstMat[dstidx] = temp_dst.x;
            dstMat[dstidx+1] = temp_dst.y;
        }
        else if(x<cols && y<rows)
        {
            dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);
        }
    }
}

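Every convert_to_* kernel in this file implements the same per-element affine conversion with saturation, dst = saturate(src * alpha + beta); the variants differ only in source/destination element type, channel count, and whether a row can be processed four pixels at a time. A scalar C model of the uchar destination case (saturation written out by hand; the final conversion truncates toward zero, which is the OpenCL convert_uchar_sat default):

/* Scalar model of convert_uchar_sat(src * alpha + beta): scale, shift,
 * clamp to [0, 255], then truncate toward zero. */
static unsigned char convert_to_u8(float src, float alpha, float beta)
{
    float v = src * alpha + beta;
    if (v < 0.0f)   return 0;
    if (v > 255.0f) return 255;
    return (unsigned char)v;
}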
__kernel void convert_to_S4_C4_D0(
|
||||
__global const int4* restrict srcMat,
|
||||
__global uchar4* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
__global const int4* restrict srcMat,
|
||||
__global uchar4* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
{
|
||||
int x=get_global_id(0);
|
||||
int y=get_global_id(1);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
|
||||
if ( (x < cols) & (y < rows) )
|
||||
{
|
||||
float4 temp_src = convert_float4(srcMat[srcidx]);
|
||||
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
|
||||
}
|
||||
int x=get_global_id(0);
|
||||
int y=get_global_id(1);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
|
||||
if ( (x < cols) & (y < rows) )
|
||||
{
|
||||
float4 temp_src = convert_float4(srcMat[srcidx]);
|
||||
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void convert_to_S5_C1_D0(
|
||||
__global const float* restrict srcMat,
|
||||
__global uchar* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
__global const float* restrict srcMat,
|
||||
__global uchar* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
{
|
||||
int x=get_global_id(0)<<2;
|
||||
int y=get_global_id(1);
|
||||
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
|
||||
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
|
||||
int off_src = (dstoffset_in_pixel & 3);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
|
||||
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
|
||||
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
|
||||
if(x+3<cols && y<rows && off_src==0)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(x+3<cols && y<rows)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
dstMat[dstidx+2] = temp_dst.z;
|
||||
dstMat[dstidx+3] = temp_dst.w;
|
||||
}
|
||||
else if(x+2<cols && y<rows)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
dstMat[dstidx+2] = temp_dst.z;
|
||||
}
|
||||
else if(x+1<cols && y<rows)
|
||||
{
|
||||
float2 temp_src = vload2(0,srcMat+srcidx);
|
||||
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
}
|
||||
else if(x<cols && y<rows)
|
||||
{
|
||||
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
|
||||
}
|
||||
}
|
||||
int x=get_global_id(0)<<2;
|
||||
int y=get_global_id(1);
|
||||
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
|
||||
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
|
||||
int off_src = (dstoffset_in_pixel & 3);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
|
||||
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
|
||||
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
|
||||
if(x+3<cols && y<rows && off_src==0)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(x+3<cols && y<rows)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
dstMat[dstidx+2] = temp_dst.z;
|
||||
dstMat[dstidx+3] = temp_dst.w;
|
||||
}
|
||||
else if(x+2<cols && y<rows)
|
||||
{
|
||||
float4 temp_src = vload4(0,srcMat+srcidx);
|
||||
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
dstMat[dstidx+2] = temp_dst.z;
|
||||
}
|
||||
else if(x+1<cols && y<rows)
|
||||
{
|
||||
float2 temp_src = vload2(0,srcMat+srcidx);
|
||||
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
|
||||
dstMat[dstidx] = temp_dst.x;
|
||||
dstMat[dstidx+1] = temp_dst.y;
|
||||
}
|
||||
else if(x<cols && y<rows)
|
||||
{
|
||||
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel void convert_to_S5_C4_D0(
|
||||
__global const float4* restrict srcMat,
|
||||
__global uchar4* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
__global const float4* restrict srcMat,
|
||||
__global uchar4* dstMat,
|
||||
int cols,
|
||||
int rows,
|
||||
int srcStep_in_pixel,
|
||||
int srcoffset_in_pixel,
|
||||
int dstStep_in_pixel,
|
||||
int dstoffset_in_pixel,
|
||||
F alpha,
|
||||
F beta)
|
||||
{
|
||||
int x=get_global_id(0);
|
||||
int y=get_global_id(1);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
|
||||
if ( (x < cols) & (y < rows) )
|
||||
{
|
||||
float4 temp_src = srcMat[srcidx];
|
||||
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
|
||||
}
|
||||
int x=get_global_id(0);
|
||||
int y=get_global_id(1);
|
||||
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
|
||||
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
|
||||
if ( (x < cols) & (y < rows) )
|
||||
{
|
||||
float4 temp_src = srcMat[srcidx];
|
||||
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void convert_to_S0_C1_D4(
        __global const uchar* restrict srcMat,
        __global int* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float temp_src = convert_float(srcMat[srcidx]);
        dstMat[dstidx] = convert_int_sat(temp_src * alpha + beta);
    }
}
__kernel void convert_to_S5_C1_D4(
        __global const float* restrict srcMat,
        __global int* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float temp_src = srcMat[srcidx];
        dstMat[dstidx] = convert_int_sat(temp_src * alpha + beta);
    }
}
__kernel void convert_to_S0_C4_D4(
        __global const uchar4* restrict srcMat,
        __global int4* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float4 temp_src = convert_float4(srcMat[srcidx]);
        dstMat[dstidx] = convert_int4_sat(temp_src * alpha + beta);
    }
}
__kernel void convert_to_S5_C4_D4(
        __global const float4* restrict srcMat,
        __global int4* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float4 temp_src = srcMat[srcidx];
        dstMat[dstidx] = convert_int4_sat(temp_src * alpha + beta);
    }
}
__kernel void convert_to_S0_C1_D5(
        __global const uchar* restrict srcMat,
        __global float* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float temp_src = convert_float(srcMat[srcidx]);
        dstMat[dstidx] = temp_src * alpha + beta;
    }
}
__kernel void convert_to_S4_C1_D5(
        __global const int* restrict srcMat,
        __global float* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float temp_src = convert_float(srcMat[srcidx]);
        dstMat[dstidx] = temp_src * alpha + beta;
    }
}
__kernel void convert_to_S0_C4_D5(
        __global const uchar4* restrict srcMat,
        __global float4* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float4 temp_src = convert_float4(srcMat[srcidx]);
        dstMat[dstidx] = temp_src * alpha + beta;
    }
}
__kernel void convert_to_S4_C4_D5(
        __global const int4* restrict srcMat,
        __global float4* dstMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        F alpha,
        F beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    if ((x < cols) & (y < rows))
    {
        float4 temp_src = convert_float4(srcMat[srcidx]);
        dstMat[dstidx] = temp_src * alpha + beta;
    }
}
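/*
 * Editorial note: every convert_to_S*_C*_D* kernel above computes the same
 * per-pixel affine transform with saturation, dst = sat(src * alpha + beta);
 * judging by the signatures, the suffix encodes source depth (S), channel
 * count (C) and destination depth (D) in OpenCV depth codes (0 = 8U,
 * 4 = 32S, 5 = 32F). A minimal generic sketch of the pattern (illustrative,
 * not part of the source):
 */
__kernel void convert_to_sketch(__global const float* src, __global uchar* dst,
                                int cols, int rows, int step, float alpha, float beta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x < cols && y < rows)
    {
        int idx = mad24(y, step, x);                            // row-major index
        dst[idx] = convert_uchar_sat(src[idx] * alpha + beta);  // clamp to [0, 255]
    }
}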
@@ -35,28 +35,28 @@
//

__kernel void copy_to_with_mask(
        __global const GENTYPE* restrict srcMat,
        __global GENTYPE* dstMat,
        __global const uchar* restrict maskMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        int maskStep,
        int maskoffset)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    x = x < cols ? x : cols - 1;
    y = y < rows ? y : rows - 1;
    int srcidx = mad24(y, srcStep_in_pixel, x + srcoffset_in_pixel);
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    int maskidx = mad24(y, maskStep, x + maskoffset);
    uchar mask = maskMat[maskidx];
    if (mask)
    {
        dstMat[dstidx] = srcMat[srcidx];
    }
}
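/*
 * Editorial note: copy_to_with_mask clamps x/y to the last valid column/row
 * instead of returning early, so out-of-range work-items still perform
 * in-bounds reads and at worst rewrite the border pixel with its own value.
 * An equivalent guard-style formulation would be:
 *
 *   if (x < cols && y < rows && maskMat[maskidx])
 *       dstMat[dstidx] = srcMat[srcidx];
 */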
@@ -38,53 +38,53 @@
__kernel void set_to_without_mask_C1_D0(uchar scalar, __global uchar * dstMat,
        int cols, int rows, int dstStep_in_pixel, int offset_in_pixel)
{
    int x = get_global_id(0) << 2;
    int y = get_global_id(1);
    //int addr_start = mad24(y, dstStep_in_pixel, offset_in_pixel);
    //int addr_end = mad24(y, dstStep_in_pixel, cols + offset_in_pixel);
    int idx = mad24(y, dstStep_in_pixel, x + offset_in_pixel);
    uchar4 out;
    out.x = out.y = out.z = out.w = scalar;

    if ((x + 3 < cols) && (y < rows) && ((offset_in_pixel & 3) == 0))
    {
        *(__global uchar4*)(dstMat + idx) = out;
    }
    else
    {
        if ((x + 3 < cols) && (y < rows))
        {
            dstMat[idx]     = out.x;
            dstMat[idx + 1] = out.y;
            dstMat[idx + 2] = out.z;
            dstMat[idx + 3] = out.w;
        }
        else if ((x + 2 < cols) && (y < rows))
        {
            dstMat[idx]     = out.x;
            dstMat[idx + 1] = out.y;
            dstMat[idx + 2] = out.z;
        }
        else if ((x + 1 < cols) && (y < rows))
        {
            dstMat[idx]     = out.x;
            dstMat[idx + 1] = out.y;
        }
        else if ((x < cols) && (y < rows))
        {
            dstMat[idx] = out.x;
        }
    }
}
__kernel void set_to_without_mask(GENTYPE scalar, __global GENTYPE * dstMat,
        int cols, int rows, int dstStep_in_pixel, int offset_in_pixel)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if ((x < cols) & (y < rows))
    {
        int idx = mad24(y, dstStep_in_pixel, x + offset_in_pixel);
        dstMat[idx] = scalar;
    }
}
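/*
 * Editorial note: set_to_without_mask_C1_D0 above writes four pixels per
 * work-item through a uchar4 store, which is only safe when the row start is
 * 4-byte aligned -- hence the ((offset_in_pixel & 3) == 0) test; the scalar
 * fallback chain handles unaligned rows and the 1-3 leftover columns at the
 * row end. A host launch for it would cover ceil(cols/4) work-items per row
 * (hypothetical sketch, rounding up to a 16x16 work-group):
 *
 *   size_t local[2]  = {16, 16};
 *   size_t global[2] = {((cols + 3) / 4 + 15) / 16 * 16, (rows + 15) / 16 * 16};
 */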
@@ -34,27 +34,27 @@
//
//
__kernel void set_to_with_mask(
        GENTYPE scalar,
        __global GENTYPE * dstMat,
        int cols,
        int rows,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        __global const uchar * restrict maskMat,
        int maskStep,
        int maskoffset)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    x = x < cols ? x : cols - 1;
    y = y < rows ? y : rows - 1;
    int dstidx = mad24(y, dstStep_in_pixel, x + dstoffset_in_pixel);
    int maskidx = mad24(y, maskStep, x + maskoffset);
    uchar mask = maskMat[maskidx];
    if (mask)
    {
        dstMat[dstidx] = scalar;
    }
}
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -75,7 +75,7 @@ __kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcSt
    const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
    const uchar src_val1 = (src + y * srcStep)[x];
    const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];

    ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
    ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}

@@ -91,7 +91,7 @@ __kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcSt
    const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
    const uchar src_val1 = (src + y * srcStep)[x];
    const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];

    ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
    ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
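/*
 * Editorial note: the (3, 10, 3) weights above are the smoothing half of the
 * separable Scharr operator and (-1, 0, 1) is its derivative half. This
 * vertical pass smooths for dx ((src_val0 + src_val2) * 3 + src_val1 * 10)
 * and differentiates for dy (src_val2 - src_val0); a matching horizontal
 * pass presumably completes each derivative.
 */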
@@ -209,20 +209,20 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
    smem3[tid] = val3;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128)
    {
        smem1[tid] = val1 += smem1[tid + 128];
        smem2[tid] = val2 += smem2[tid + 128];
        smem3[tid] = val3 += smem3[tid + 128];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64)
    {
        smem1[tid] = val1 += smem1[tid + 64];
        smem2[tid] = val2 += smem2[tid + 64];
        smem3[tid] = val3 += smem3[tid + 64];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 32)
@@ -231,28 +231,28 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f
        volatile __local float* vmem2 = smem2;
        volatile __local float* vmem3 = smem3;

        vmem1[tid] = val1 += vmem1[tid + 32];
        vmem2[tid] = val2 += vmem2[tid + 32];
        vmem3[tid] = val3 += vmem3[tid + 32];

        vmem1[tid] = val1 += vmem1[tid + 16];
        vmem2[tid] = val2 += vmem2[tid + 16];
        vmem3[tid] = val3 += vmem3[tid + 16];

        vmem1[tid] = val1 += vmem1[tid + 8];
        vmem2[tid] = val2 += vmem2[tid + 8];
        vmem3[tid] = val3 += vmem3[tid + 8];

        vmem1[tid] = val1 += vmem1[tid + 4];
        vmem2[tid] = val2 += vmem2[tid + 4];
        vmem3[tid] = val3 += vmem3[tid + 4];

        vmem1[tid] = val1 += vmem1[tid + 2];
        vmem2[tid] = val2 += vmem2[tid + 2];
        vmem3[tid] = val3 += vmem3[tid + 2];

        vmem1[tid] = val1 += vmem1[tid + 1];
        vmem2[tid] = val2 += vmem2[tid + 1];
        vmem3[tid] = val3 += vmem3[tid + 1];
    }
}
@@ -263,18 +263,18 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
    smem2[tid] = val2;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128)
    {
        smem1[tid] = val1 += smem1[tid + 128];
        smem2[tid] = val2 += smem2[tid + 128];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64)
    {
        smem1[tid] = val1 += smem1[tid + 64];
        smem2[tid] = val2 += smem2[tid + 64];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 32)
@@ -282,23 +282,23 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2,
        volatile __local float* vmem1 = smem1;
        volatile __local float* vmem2 = smem2;

        vmem1[tid] = val1 += vmem1[tid + 32];
        vmem2[tid] = val2 += vmem2[tid + 32];

        vmem1[tid] = val1 += vmem1[tid + 16];
        vmem2[tid] = val2 += vmem2[tid + 16];

        vmem1[tid] = val1 += vmem1[tid + 8];
        vmem2[tid] = val2 += vmem2[tid + 8];

        vmem1[tid] = val1 += vmem1[tid + 4];
        vmem2[tid] = val2 += vmem2[tid + 4];

        vmem1[tid] = val1 += vmem1[tid + 2];
        vmem2[tid] = val2 += vmem2[tid + 2];

        vmem1[tid] = val1 += vmem1[tid + 1];
        vmem2[tid] = val2 += vmem2[tid + 1];
    }
}
@@ -307,28 +307,28 @@ void reduce1(float val1, __local float* smem1, int tid)
    smem1[tid] = val1;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128)
    {
        smem1[tid] = val1 += smem1[tid + 128];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64)
    {
        smem1[tid] = val1 += smem1[tid + 64];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 32)
    {
        volatile __local float* vmem1 = smem1;

        vmem1[tid] = val1 += vmem1[tid + 32];
        vmem1[tid] = val1 += vmem1[tid + 16];
        vmem1[tid] = val1 += vmem1[tid + 8];
        vmem1[tid] = val1 += vmem1[tid + 4];
        vmem1[tid] = val1 += vmem1[tid + 2];
        vmem1[tid] = val1 += vmem1[tid + 1];
    }
}
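/*
 * Editorial sketch: reduce1/reduce2/reduce3 above implement a classic
 * shared-memory tree reduction over a 256-work-item group. The first two
 * rounds (128, 64) are barrier-synchronized; below 32 lanes the code relies
 * on warp/wavefront lockstep plus volatile loads instead of barriers, which
 * is an implementation-specific assumption, not portable OpenCL. A fully
 * portable single-value variant would look like this (illustrative, not
 * from the source; assumes a 256-work-item work-group like reduce1/2/3):
 */
__kernel void reduce_portable(__global const float* in, __global float* out)
{
    __local float smem[256];
    const int tid = get_local_id(0);
    smem[tid] = in[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s = 128; s > 0; s >>= 1)    // halve the active range each round
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        barrier(CLK_LOCAL_MEM_FENCE);    // barrier every round: portable
    }
    if (tid == 0)
        out[get_group_id(0)] = smem[0];  // one partial sum per work-group
}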
@@ -344,8 +344,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    __local float smem2[256];
    __local float smem3[256];

    int c_halfWin_x = (c_winSize_x - 1) / 2;
    int c_halfWin_y = (c_winSize_y - 1) / 2;

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);

@@ -359,18 +359,18 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
        {
            status[get_group_id(0)] = 0;

            //if (calcErr)
            //    err[get_group_id(0)] = 0;
        }

        return;
    }

    prevPt.x -= c_halfWin_x;
    prevPt.y -= c_halfWin_y;

    // extract the patch from the first image, compute covariance matrix of derivatives

    float A11 = 0;
    float A12 = 0;
    float A22 = 0;
@@ -380,14 +380,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    float dIdy_patch[21][21];

    for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
    {
        for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
        {
            float x = (prevPt.x + xBase + 0.5f);
            float y = (prevPt.y + yBase + 0.5f);

            I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;

            float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
                         (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);

@@ -396,7 +396,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,

            dIdx_patch[i][j] = dIdx;
            dIdy_patch[i][j] = dIdy;

            A11 += dIdx * dIdx;
            A12 += dIdx * dIdy;
            A22 += dIdy * dIdy;
@@ -409,10 +409,10 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    A11 = smem1[0];
    A12 = smem2[0];
    A22 = smem3[0];

    float D = A11 * A22 - A12 * A12;

    //if (calcErr && GET_MIN_EIGENVALS && tid == 0)
    //    err[get_group_id(0)] = minEig;

    if (D < 1.192092896e-07f)
@@ -431,8 +431,8 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,

    float2 nextPt = nextPts[get_group_id(0)];
    nextPt.x *= 2.0f;
    nextPt.y *= 2.0f;

    nextPt.x -= c_halfWin_x;
    nextPt.y -= c_halfWin_y;

@@ -447,14 +447,14 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,

    float b1 = 0;
    float b2 = 0;

    for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
    {
        for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
        {
            float a = (nextPt.x + x + 0.5f);
            float b = (nextPt.y + y + 0.5f);

            float I_val = I_patch[i][j];
            float J_val = read_imagef(J, sampler, (float2)(a, b)).x;

@@ -464,7 +464,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
            b2 += diff * dIdy_patch[i][j];
        }
    }

    reduce2(b1, b2, smem1, smem2, tid);
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -474,7 +474,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    float2 delta;
    delta.x = A12 * b2 - A22 * b1;
    delta.y = A12 * b1 - A11 * b2;

    nextPt.x += delta.x;
    nextPt.y += delta.y;

@@ -489,9 +489,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
    {
        for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
        {
            float a = (nextPt.x + x + 0.5f);
            float b = (nextPt.y + y + 0.5f);

            float I_val = I_patch[i][j];
            float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
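/*
 * Editorial note: lkSparse_C1_D5 is one Lucas-Kanade pyramid step. It builds
 * the 2x2 gradient (structure) matrix G = [A11 A12; A12 A22] over the patch,
 * rejects the point when det(G) = A11*A22 - A12*A12 falls below ~FLT_EPSILON
 * (1.192092896e-07f), and then iterates delta = -G^-1 * b, where b is the
 * image-difference vector accumulated as b1/b2; the A12*b2 - A22*b1 form is
 * Cramer's rule for that 2x2 solve (the 1/det factor is presumably folded in
 * where A11/A12/A22 are normalized, outside the hunks shown here).
 */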
@@ -522,8 +522,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    __local float smem2[256];
    __local float smem3[256];

    int c_halfWin_x = (c_winSize_x - 1) / 2;
    int c_halfWin_y = (c_winSize_y - 1) / 2;

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);

@@ -537,18 +537,18 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
        {
            status[get_group_id(0)] = 0;

            //if (calcErr)
            //    err[get_group_id(0)] = 0;
        }

        return;
    }

    prevPt.x -= c_halfWin_x;
    prevPt.y -= c_halfWin_y;

    // extract the patch from the first image, compute covariance matrix of derivatives

    float A11 = 0;
    float A12 = 0;
    float A22 = 0;
@@ -558,14 +558,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    float4 dIdy_patch[21][21];

    for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
    {
        for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
        {
            float x = (prevPt.x + xBase + 0.5f);
            float y = (prevPt.y + yBase + 0.5f);

            I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;

            float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
                          (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);

@@ -574,7 +574,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,

            dIdx_patch[i][j] = dIdx;
            dIdy_patch[i][j] = dIdy;

            A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z;
            A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z;
            A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z;
@@ -587,10 +587,10 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    A11 = smem1[0];
    A12 = smem2[0];
    A22 = smem3[0];

    float D = A11 * A22 - A12 * A12;

    //if (calcErr && GET_MIN_EIGENVALS && tid == 0)
    //    err[get_group_id(0)] = minEig;

    if (D < 1.192092896e-07f)
@@ -609,8 +609,8 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,

    float2 nextPt = nextPts[get_group_id(0)];
    nextPt.x *= 2.0f;
    nextPt.y *= 2.0f;

    nextPt.x -= c_halfWin_x;
    nextPt.y -= c_halfWin_y;

@@ -625,14 +625,14 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,

    float b1 = 0;
    float b2 = 0;

    for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
    {
        for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
        {
            float a = (nextPt.x + x + 0.5f);
            float b = (nextPt.y + y + 0.5f);

            float4 I_val = I_patch[i][j];
            float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;

@@ -642,7 +642,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
            b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z;
        }
    }

    reduce2(b1, b2, smem1, smem2, tid);
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -652,7 +652,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    float2 delta;
    delta.x = A12 * b2 - A22 * b1;
    delta.y = A12 * b1 - A11 * b2;

    nextPt.x += delta.x;
    nextPt.y += delta.y;

@@ -667,9 +667,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    {
        for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
        {
            float a = (nextPt.x + x + 0.5f);
            float b = (nextPt.y + y + 0.5f);

            float4 I_val = I_patch[i][j];
            float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;

@@ -694,11 +694,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
    }
}
__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
    const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
{
    int c_halfWin_x = (c_winSize_x - 1) / 2;
    int c_halfWin_y = (c_winSize_y - 1) / 2;

    const int patchWidth = get_local_size(0) + 2 * c_halfWin_x;
    const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
@@ -712,8 +712,8 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
    const int xBase = get_group_id(0) * get_local_size(0);
    const int yBase = get_group_id(1) * get_local_size(1);

    sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
    {
        for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
@@ -735,7 +735,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
    barrier(CLK_LOCAL_MEM_FENCE);

    // extract the patch from the first image, compute covariance matrix of derivatives

    const int x = get_global_id(0);
    const int y = get_global_id(1);

@@ -747,24 +747,24 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
    int A22i = 0;

    for (int i = 0; i < c_winSize_y; ++i)
    {
        for (int j = 0; j < c_winSize_x; ++j)
        {
            int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
            int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];

            A11i += dIdx * dIdx;
            A12i += dIdx * dIdy;
            A22i += dIdy * dIdy;
        }
    }

    float A11 = A11i;
    float A12 = A12i;
    float A22 = A22i;

    float D = A11 * A22 - A12 * A12;

    //if (calcErr && GET_MIN_EIGENVALS)
    //    (err + y * errStep)[x] = minEig;

@@ -819,7 +819,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
    float2 delta;
    delta.x = A12 * b2 - A22 * b1;
    delta.y = A12 * b1 - A11 * b2;

    nextPt.x += delta.x;
    nextPt.y += delta.y;
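/*
 * Editorial note: lkDense_C1_D0 accumulates A11i/A12i/A22i in integer
 * arithmetic (the Scharr derivatives of uchar input are integral) and
 * converts to float only once, after the window loop, before forming
 * D = A11 * A22 - A12 * A12.
 */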
@@ -51,9 +51,9 @@
////////////vector function name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
        __global uchar *mat_dst0, int dst0_step, int dst0_offset,
        __global uchar *mat_dst1, int dst1_step, int dst1_offset,
        __global uchar *mat_dst2, int dst2_step, int dst2_offset,
        __global uchar *mat_dst3, int dst3_step, int dst3_offset,
        int rows, int cols, int dst_step1)

@@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 2;

        int src_idx = mad24(y, src_step, src_offset + (x << 2));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;

        int dst3_start = mad24(y, dst3_step, dst3_offset);
        int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;

        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx)));
        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx)));
        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));

        int total_bytes = src_offset + rows * src_step;
        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx)));
        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx)));
        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));

        uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
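/*
 * Editorial sketch: per the naming comment above, the host side would compose
 * the kernel name from the channel count and depth code, e.g. (hypothetical
 * helper, not from the source):
 *
 *   char kernelName[64];
 *   sprintf(kernelName, "split_vector_C%d_D%d", channels, depth); // -> "split_vector_C4_D0"
 */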
@@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
}

__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
        __global uchar *mat_dst0, int dst0_step, int dst0_offset,
        __global uchar *mat_dst1, int dst1_step, int dst1_offset,
        __global uchar *mat_dst2, int dst2_step, int dst2_offset,
        int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 2;

        int src_idx = mad24(y, src_step, src_offset);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);

        uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
        uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
        uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s

        uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
        int index = 3 - dst0_offset & 3;
        tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);

        uchar4 data0, data1, data2;

        data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
        data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
        data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -263,31 +263,31 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
}

__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
        __global uchar *mat_dst0, int dst0_step, int dst0_offset,
        __global uchar *mat_dst1, int dst1_step, int dst1_offset,
        int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 2;

#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
        int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
        int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);

        uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
        uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
@@ -312,9 +312,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s
}

__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
        __global char *mat_dst0, int dst0_step, int dst0_offset,
        __global char *mat_dst1, int dst1_step, int dst1_offset,
        __global char *mat_dst2, int dst2_step, int dst2_offset,
        __global char *mat_dst3, int dst3_step, int dst3_offset,
        int rows, int cols, int dst_step1)

@@ -322,35 +322,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 2;

        int src_idx = mad24(y, src_step, src_offset + (x << 2));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);

        int dst3_start = mad24(y, dst3_step, dst3_offset);
        int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);

        char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
        char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
        char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
        char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
        char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
        char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
        char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));

        char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@@ -423,33 +423,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
}

__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
        __global char *mat_dst0, int dst0_step, int dst0_offset,
        __global char *mat_dst1, int dst1_step, int dst1_offset,
        __global char *mat_dst2, int dst2_step, int dst2_offset,
        int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 2;

        int src_idx = mad24(y, src_step, src_offset);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);

        char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
        char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
        char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
@@ -486,10 +486,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr

        char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
        int index = 3 - dst0_offset & 3;
        tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);

        char4 data0, data1, data2;

        data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
        data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
        data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
|
||||
}
|
||||
|
||||
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
__global char *mat_dst0, int dst0_step, int dst0_offset,
|
||||
__global char *mat_dst1, int dst1_step, int dst1_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < cols) && (y < rows))
|
||||
if((x < cols) && (y < rows))
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst0_align ((dst0_offset & 3) << 1)
|
||||
#define dst1_align ((dst1_offset & 3) << 1)
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
|
||||
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
|
||||
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_start = mad24(y, dst0_step, dst0_offset);
|
||||
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
|
||||
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
|
||||
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_start = mad24(y, dst1_step, dst1_offset);
|
||||
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
|
||||
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
|
||||
|
||||
|
||||
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
|
||||
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
|
||||
|
||||
@@ -571,9 +571,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr
}

__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
        __global ushort *mat_dst0, int dst0_step, int dst0_offset,
        __global ushort *mat_dst1, int dst1_step, int dst1_offset,
        __global ushort *mat_dst2, int dst2_step, int dst2_offset,
        __global ushort *mat_dst3, int dst3_step, int dst3_offset,
        int rows, int cols, int dst_step1)

@@ -581,29 +581,29 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

        int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
        int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

        int dst3_start = mad24(y, dst3_step, dst3_offset);
        int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);

        ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
        ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
@@ -639,33 +639,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
}

__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
        __global ushort *mat_dst0, int dst0_step, int dst0_offset,
        __global ushort *mat_dst1, int dst1_step, int dst1_offset,
        __global ushort *mat_dst2, int dst2_step, int dst2_offset,
        int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

        int src_idx = mad24(y, src_step, src_offset);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

        ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
        ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
        ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -702,31 +702,31 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int
}

__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
        __global ushort *mat_dst0, int dst0_step, int dst0_offset,
        __global ushort *mat_dst1, int dst1_step, int dst1_offset,
        int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
        int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
        int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
        ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1));
@@ -746,9 +746,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int
    }
}
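// The C2 kernels above hide a subtlety: dstN_align appears to measure how far
// each destination row start is from 4-byte alignment, and the source reads
// are shifted back by that amount so every destination store can land on a
// 4-byte-aligned address. As a point of comparison, here is a minimal scalar
// sketch of the same 2-channel ushort split (hypothetical, not part of this
// commit; one work-item per pixel, no alignment bookkeeping, offsets and
// steps in bytes):
__kernel void split_C2_D2_scalar_ref (__global const ushort *mat_src, int src_step, int src_offset,
                                      __global ushort *mat_dst0, int dst0_step, int dst0_offset,
                                      __global ushort *mat_dst1, int dst1_step, int dst1_offset,
                                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        // interleaved source pixel (c0, c1) at column x
        __global const ushort *src = (__global const ushort *)
            ((__global const char *)mat_src + mad24(y, src_step, src_offset)) + (x << 1);
        ((__global ushort *)((__global char *)mat_dst0 + mad24(y, dst0_step, dst0_offset)))[x] = src[0];
        ((__global ushort *)((__global char *)mat_dst1 + mad24(y, dst1_step, dst1_offset)))[x] = src[1];
    }
}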
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
                                  __global short *mat_dst2, int dst2_step, int dst2_offset,
                                  __global short *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)
@@ -756,29 +756,29 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

        int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
        int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

        int dst3_start = mad24(y, dst3_step, dst3_offset);
        int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);

        short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0));
        short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
@@ -813,33 +813,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
    }
}
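// The middle of split_vector_C4_D3 is elided between the hunks above. From
// the visible setup it appears that each work-item handles two 4-channel
// short pixels (x <<= 1, 8 bytes per pixel) and stages two loads at
// src_idx +/- 8 bytes around the pixel pair so that the per-channel stores
// can reuse the same 4-byte-aligned dstN_idx scheme as the other kernels --
// an assumption, since the store code is not shown in this diff.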
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
                                  __global short *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

        int src_idx = mad24(y, src_step, src_offset);

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        int dst2_start = mad24(y, dst2_step, dst2_offset);
        int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

        short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
        short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
        short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -877,31 +877,31 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s

__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        x = x << 1;

#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
        int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
        int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));

        int dst0_start = mad24(y, dst0_step, dst0_offset);
        int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

        int dst1_start = mad24(y, dst1_step, dst1_offset);
        int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

        short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
        short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
@@ -921,9 +921,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s
    }
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
                                  __global int *mat_dst2, int dst2_step, int dst2_offset,
                                  __global int *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)
@@ -931,14 +931,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);

        int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];

        ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -948,18 +948,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
    }
}
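// The D4 (int) kernels are markedly simpler than the 8- and 16-bit variants:
// a 4-byte element is already aligned for the scalar stores used here, so no
// start/end clamping or offset masking is visible in the hunks above. The
// dst_step1 parameter is still accepted, presumably so the host can launch
// every split_vector_* kernel with a uniform argument list (an inference,
// not stated in this diff).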
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
                                  __global int *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -975,20 +975,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src
}

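// The 3-channel int body is elided above. A minimal scalar reference for
// what a C3_D4 split must compute (hypothetical sketch, not this file's
// vectorized implementation; offsets and steps are in bytes):
__kernel void split_C3_D4_scalar_ref (__global const int *mat_src, int src_step, int src_offset,
                                      __global int *mat_dst0, int dst0_step, int dst0_offset,
                                      __global int *mat_dst1, int dst1_step, int dst1_offset,
                                      __global int *mat_dst2, int dst2_step, int dst2_offset,
                                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        // interleaved source pixel (c0, c1, c2) at column x
        __global const int *src = (__global const int *)
            ((__global const char *)mat_src + mad24(y, src_step, src_offset)) + x * 3;
        ((__global int *)((__global char *)mat_dst0 + mad24(y, dst0_step, dst0_offset)))[x] = src[0];
        ((__global int *)((__global char *)mat_dst1 + mad24(y, dst1_step, dst1_offset)))[x] = src[1];
        ((__global int *)((__global char *)mat_dst2 + mad24(y, dst2_step, dst2_offset)))[x] = src[2];
    }
}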
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);

        int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];

        ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -997,9 +997,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src
}

__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                  __global float *mat_dst2, int dst2_step, int dst2_offset,
                                  __global float *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)
@@ -1007,14 +1007,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);

        float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1025,18 +1025,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
}

__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                  __global float *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1052,20 +1052,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s
}

__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);

        float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1075,9 +1075,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s

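// The double-precision kernels below are compiled only when the host passes
// -D DOUBLE_SUPPORT in the program build options. On most OpenCL
// implementations the fp64 extension must also be enabled before 'double'
// may be used, typically via (assumed host-side convention, not shown in
// this hunk):
//
//     #pragma OPENCL EXTENSION cl_khr_fp64 : enable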
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  __global double *mat_dst2, int dst2_step, int dst2_offset,
                                  __global double *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)
@@ -1085,14 +1085,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);

        double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1103,18 +1103,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
}

__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  __global double *mat_dst2, int dst2_step, int dst2_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1130,20 +1130,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int
}

__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                  int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < cols) && (y < rows))
    {
        int src_idx = mad24(y, src_step, src_offset);
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);

        double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;