format files to ANSI C style with coolformat

change the download channels to oclchannles()
fix bugs of arithm functions
perf fix of bilateral
bug fix of split test case
add build_warps functions
This commit is contained in:
niko
2012-10-11 16:22:47 +08:00
parent 69fbc6102c
commit 97156897b2
78 changed files with 15433 additions and 12118 deletions

View File

@@ -70,9 +70,22 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = abs_diff(src1_data, src2_data);
@@ -242,9 +255,15 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));

View File

@@ -71,10 +71,22 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
@@ -248,11 +260,31 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 mask_data = vload4(0, mask + mask_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);

View File

@@ -65,10 +65,16 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);

View File

@@ -68,10 +68,23 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;

View File

@@ -71,9 +71,22 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src_data_0 = vload4(0, src + src_index_0);
uchar4 src_data_1 = vload4(0, src + src_index_1);
int src1_index_fix = src_index_0 < 0 ? 0 : src_index_0;
int src2_index_fix = src_index_1 < 0 ? 0 : src_index_1;
uchar4 src_data_0 = vload4(0, src + src1_index_fix);
uchar4 src_data_1 = vload4(0, src + src2_index_fix);
if(src_index_0 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_0 == -2) ? src_data_0.zwxy:src_data_0.yzwx;
src_data_0.xyzw = (src_index_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_index_1 < 0)
{
uchar4 tmp;
tmp.xyzw = (src_index_1 == -2) ? src_data_1.zwxy:src_data_1.yzwx;
src_data_1.xyzw = (src_index_1 == -1) ? src_data_1.wxyz:tmp.xyzw;
}
uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));

View File

@@ -0,0 +1,237 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
__kernel
void buildWarpPlaneMaps
(
__global float * map_x,
__global float * map_y,
__constant float * KRT,
int tl_u,
int tl_v,
int cols,
int rows,
int step_x,
int step_y,
float scale
)
{
int du = get_global_id(0);
int dv = get_global_id(1);
step_x /= sizeof(float);
step_y /= sizeof(float);
__constant float * ck_rinv = KRT;
__constant float * ct = KRT + 9;
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
float x_ = u / scale - ct[0];
float y_ = v / scale - ct[1];
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
x /= z;
y /= z;
map_x[dv * step_x + du] = x;
map_y[dv * step_y + du] = y;
}
}
__kernel
void buildWarpCylindricalMaps
(
__global float * map_x,
__global float * map_y,
__constant float * ck_rinv,
int tl_u,
int tl_v,
int cols,
int rows,
int step_x,
int step_y,
float scale
)
{
int du = get_global_id(0);
int dv = get_global_id(1);
step_x /= sizeof(float);
step_y /= sizeof(float);
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
u /= scale;
float x_ = sin(u);
float y_ = v / scale;
float z_ = cos(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
map_x[dv * step_x + du] = x;
map_y[dv * step_y + du] = y;
}
}
__kernel
void buildWarpSphericalMaps
(
__global float * map_x,
__global float * map_y,
__constant float * ck_rinv,
int tl_u,
int tl_v,
int cols,
int rows,
int step_x,
int step_y,
float scale
)
{
int du = get_global_id(0);
int dv = get_global_id(1);
step_x /= sizeof(float);
step_y /= sizeof(float);
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
v /= scale;
u /= scale;
float sinv = sin(v);
float x_ = sinv * sin(u);
float y_ = - cos(v);
float z_ = sinv * cos(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
map_x[dv * step_x + du] = x;
map_y[dv * step_y + du] = y;
}
}
__kernel
void buildWarpAffineMaps
(
__global float * xmap,
__global float * ymap,
__constant float * c_warpMat,
int cols,
int rows,
int step_x,
int step_y
)
{
int x = get_global_id(0);
int y = get_global_id(1);
step_x /= sizeof(float);
step_y /= sizeof(float);
if (x < cols && y < rows)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
map_x[y * step_x + x] = xcoo;
map_y[y * step_y + x] = ycoo;
}
}
__kernel
void buildWarpPerspectiveMaps
(
__global float * xmap,
__global float * ymap,
__constant float * c_warpMat,
int cols,
int rows,
int step_x,
int step_y
)
{
int x = get_global_id(0);
int y = get_global_id(1);
step_x /= sizeof(float);
step_y /= sizeof(float);
if (x < cols && y < rows)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
map_x[y * step_x + x] = xcoo;
map_y[y * step_y + x] = ycoo;
}
}

View File

@@ -254,7 +254,8 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
//ss = convert_uint4(src[cur_addr]);
int cur_col = clamp(startX + col, 0, src_whole_cols);
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
if(con)
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
data[i] = con ? ss : 0;
}
@@ -269,6 +270,7 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
@@ -334,11 +336,12 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
// int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
// ss = src[cur_addr];
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
ss = src[(startY+i)*(src_step>>2) + cur_col];
//ss = src[(startY+i)*(src_step>>2) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;
data[i] = con ? ss : 0.f;
}
@@ -422,7 +425,8 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
ss = src[(startY+i)*(src_step>>4) + cur_col];
//ss = src[(startY+i)*(src_step>>4) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
}

View File

@@ -31,84 +31,8 @@
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
//#pragma OPENCL EXTENSION cl_amd_printf :enable
__kernel
void bilateral4(__global uchar4 *dst,
__global uchar4 *src,
int rows,
int cols,
int channels,
int radius,
int wholerows,
int wholecols,
int src_step,
int dst_step,
int src_offset,
int dst_offset,
__constant float *sigClr,
__constant float *sigSpc)
{
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
uint gdx = get_global_id(0);
uint gdy = get_global_id(1);
uint gidx = gdx >=cols?cols-1:gdx;
uint gidy = gdy >=rows?rows-1:gdy;
uchar4 p,q,tmp;
float4 pf = 0,pq = 0,pd = 0;
float wt =0;
int r = radius;
int ij = 0;
int ct = 0;
uint index_src = src_offset/4 + gidy*src_step/4 + gidx;
uint index_dst = dst_offset/4 + gidy*dst_step/4 + gidx;
p = src[index_src];
uint gx,gy;
uint src_index,dst_index;
for(int ii = -r;ii<r+1;ii++)
{
for(int jj =-r;jj<r+1;jj++)
{
ij = ii*ii+jj*jj;
if(ij > mul24(radius,radius)) continue;
gx = gidx + jj;
gy = gidy + ii;
src_index = src_offset/4 + gy * src_step/4 + gx;
q = src[src_index];
ct = abs(p.x-q.x)+abs(p.y-q.y)+abs(p.z-q.z);
wt =sigClr[ct]*sigSpc[(ii+radius)*(2*radius+1)+jj+radius];
pf.x += q.x*wt;
pf.y += q.y*wt;
pf.z += q.z*wt;
// pf.w += q.w*wt;
pq += wt;
}
}
pd = pf/pq;
dst[index_dst] = convert_uchar4_rte(pd);
}
__kernel void bilateral(__global uchar *dst,
__kernel void bilateral_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
@@ -128,8 +52,8 @@ __kernel void bilateral(__global uchar *dst,
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,src_step,gidx+dst_offset);
float sum = 0, wsum = 0;
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float sum = 0.f, wsum = 0.f;
int val0 = (int)src[src_addr];
for(int k = 0; k < maxk; k++ )
@@ -142,4 +66,73 @@ __kernel void bilateral(__global uchar *dst,
dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f);
}
}
__kernel void bilateral2_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float4 sum = (float4)(0.f), wsum = (float4)(0.f);
int4 val0 = convert_int4(vload4(0,src+src_addr));
for(int k = 0; k < maxk; k++ )
{
int4 val = convert_int4(vload4(0,src+src_addr + space_ofs[k]));
float4 w = (float4)(space_weight[k])*(float4)(color_weight[abs(val.x - val0.x)],color_weight[abs(val.y - val0.y)],color_weight[abs(val.z - val0.z)],color_weight[abs(val.w - val0.w)]);
sum += convert_float4(val)*w;
wsum += w;
}
*(__global uchar4*)(dst+dst_addr) = convert_uchar4_rtz(sum/wsum+0.5f);
}
}
__kernel void bilateral_C4_D0(__global uchar4 *dst,
__global const uchar4 *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if((gidy<dst_rows) && (gidx<dst_cols))
{
int src_addr = mad24(gidy+radius,src_step,gidx+radius);
int dst_addr = mad24(gidy,dst_step,gidx+dst_offset);
float4 sum = (float4)0.f;
float wsum = 0.f;
int4 val0 = convert_int4(src[src_addr]);
for(int k = 0; k < maxk; k++ )
{
int4 val = convert_int4(src[src_addr + space_ofs[k]]);
float w = space_weight[k]*color_weight[abs(val.x - val0.x)+abs(val.y - val0.y)+abs(val.z - val0.z)];
sum += convert_float4(val)*(float4)w;
wsum += w;
}
wsum=1.f/wsum;
dst[dst_addr] = convert_uchar4_rtz(sum*(float4)wsum+(float4)0.5f);
}
}

View File

@@ -144,16 +144,18 @@ __kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))c
int rowIndex = mad24(gy, gn, gx);
// rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1);
__local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE + 1];
__local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];
subhist[lidy] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
gidx = ((gidx>=left_col) ? (gidx+cols) : gidx);
int src_index = src_offset + mad24(gidy, src_step, gidx);
barrier(CLK_LOCAL_MEM_FENCE);
int p = (int)src[src_index];
p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p;
atomic_inc(subhist + p);
if(gidy<rows)
{
int src_index = src_offset + mad24(gidy, src_step, gidx);
int p = (int)src[src_index];
// p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p;
atomic_inc(subhist + p);
}
barrier(CLK_LOCAL_MEM_FENCE);
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];

View File

@@ -0,0 +1,252 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
// atomic add for 32bit floating point
inline void atomic_addf(volatile __global float *source, const float operand) {
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
__kernel void memsetKernel(
float val,
__global float * image,
int width,
int height,
int step, // in element
int offset
)
{
if(get_global_id(0) >= width || get_global_id(1) >= height)
{
return;
}
image += offset;
image[get_global_id(0) + get_global_id(1) * step] = val;
}
__kernel void normalizeKernel(
__global float * buffer,
int width,
int height,
int step,
int f_offset,
int d_offset
)
{
__global float * factors = buffer + f_offset;
__global float * dst = buffer + d_offset;
int j = get_global_id(0);
int i = get_global_id(1);
if(j >= width || i >= height)
{
return;
}
float scale = factors[step * i + j];
float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
dst[step * i + j] *= invScale;
}
__kernel void forwardWarpKernel(
__global const float * src,
__global float * buffer,
__global const float * u,
__global const float * v,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const int factor_offset,
const int dst_offset,
const float time_scale
)
{
int j = get_global_id(0);
int i = get_global_id(1);
if (i >= h || j >= w) return;
volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
volatile __global float * dst = (volatile __global float *)buffer + dst_offset;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
//bottom left corner of a target pixel
float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
float px;
float py;
float dx = modf(cx, &px);
float dy = modf(cy, &py);
// target pixel integer coords
int tx;
int ty;
tx = (int) px;
ty = (int) py;
float value = src[image_row_offset + j];
float weight;
// fill pixel containing bottom right corner
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing bottom left corner
tx -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * dy;
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper left corner
ty -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper right corner
tx += 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * (1.0f - dy);
atomic_addf(dst + ty * image_stride + tx, value * weight);
atomic_addf(normalization_factor + ty * image_stride + tx, weight);
}
}
// define buffer offsets
enum
{
O0_OS = 0,
O1_OS,
U_OS,
V_OS,
UR_OS,
VR_OS
};
__kernel void blendFramesKernel(
image2d_t tex_src0,
image2d_t tex_src1,
__global float * buffer,
__global float * out,
int w,
int h,
int step,
float theta
)
{
__global float * u = buffer + h * step * U_OS;
__global float * v = buffer + h * step * V_OS;
__global float * ur = buffer + h * step * UR_OS;
__global float * vr = buffer + h * step * VR_OS;
__global float * o0 = buffer + h * step * O0_OS;
__global float * o1 = buffer + h * step * O1_OS;
int ix = get_global_id(0);
int iy = get_global_id(1);
if(ix >= w || iy >= h) return;
int pos = ix + step * iy;
float _u = u[pos];
float _v = v[pos];
float _ur = ur[pos];
float _vr = vr[pos];
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
read_imagef(tex_src1, sampler, coord1).x * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = read_imagef(tex_src0, sampler, coord0).x;
}
else
{
// visible on the second frame only
out[pos] = read_imagef(tex_src1, sampler, coord1).x;
}
}