rename all the perf test files

fix the channel 3 bug in matrix operation
perf and bug fixes for LUT, haardetect, convertC3C4, resize, warpaffine, copytom, settom
add convolve
remove stereo
This commit is contained in:
niko
2012-09-12 10:20:04 +08:00
parent e94cd1ec72
commit 23244a3565
50 changed files with 2080 additions and 5377 deletions

View File

@@ -125,38 +125,38 @@ __kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
uint rows,
uint cols,
uint channels,
uint whole_rows,
uint whole_cols,
uint src_offset,
uint dst_offset,
uint lut_offset,
uint src_step,
uint dst_step)
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
uint gidx = get_global_id(0);
uint gidy = get_global_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
mem_fence(CLK_LOCAL_MEM_FENCE);
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
gidx = gidx >= cols?cols-1:gidx;
gidy = gidy >= rows?rows-1:gidy;
uint src_index = src_offset/4 + gidy * src_step/4 + gidx;
uint dst_index = dst_offset/4 + gidy * dst_step/4 + gidx;
uchar4 p = src[src_index];
dst[dst_index].x = l[p.x];
dst[dst_index].y = l[p.y];
dst[dst_index].z = l[p.z];
dst[dst_index].w = l[p.w];
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
}

View File

@@ -33,13 +33,13 @@
//
//
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#define WORKGROUPSIZE 256
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
//read data from source
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
@@ -54,36 +54,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
//permutate the data in LDS to avoid global memory conflict
__local GENTYPE4 rearrange[WORKGROUPSIZE*4];
int lid = get_local_id(0)<<2;
rearrange[lid++] = outpix0;
rearrange[lid++] = outpix1;
rearrange[lid++] = outpix2;
rearrange[lid] = outpix3;
lid = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
outpix0 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix1 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix2 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix3 = rearrange[lid];
//calculate output index
int4 outx, outy;
int4 startid = mad24((int)get_group_id(0),WORKGROUPSIZE*4,(int)get_local_id(0));
startid.y+=WORKGROUPSIZE;
startid.z+=WORKGROUPSIZE*2;
startid.w+=WORKGROUPSIZE*3;
outx = startid%(int4)cols;
outy = startid/(int4)cols;
int4 addr = mad24(outy,dstStep_in_piexl,outx);
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
@@ -119,10 +102,10 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
//read data from src
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
@@ -137,40 +120,23 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
//permutate the data in LDS to avoid global memory conflict
__local GENTYPE4 rearrange[WORKGROUPSIZE*3];
int lid = mul24((int)get_local_id(0),3);
rearrange[lid++] = pixel0;
rearrange[lid++] = outpixel1;
rearrange[lid] = outpixel2;
barrier(CLK_LOCAL_MEM_FENCE);
lid = get_local_id(0);
pixel0 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpixel1 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpixel2 = rearrange[lid];
//calcultate output index
int3 startid = mad24((int)get_group_id(0),WORKGROUPSIZE*3,(int)get_local_id(0));
startid.y+=WORKGROUPSIZE;
startid.z+=WORKGROUPSIZE*2;
//id = mul24(id>>2 , 3);
if(startid.z <= pixel_end)
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
//printf("%d ",outaddr.z);
if(outaddr.z <= pixel_end)
{
dst[startid.x] = pixel0;
dst[startid.y] = outpixel1;
dst[startid.z] = outpixel2;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(startid.y <= pixel_end)
else if(outaddr.y <= pixel_end)
{
dst[startid.x] = pixel0;
dst[startid.y] = outpixel1;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(startid.x <= pixel_end)
else if(outaddr.x <= pixel_end)
{
dst[startid.x] = pixel0;
}
dst[outaddr.x] = pixel0;
}
}

View File

@@ -87,6 +87,7 @@ The length of the convovle kernel supported is only related to the MAX size of L
which is HW related.
Niko
6/29/2011
The info above may be obsolete.
***********************************************************************************/

View File

@@ -92,6 +92,7 @@ For channels = 2, the RADIUS is no more than LSIZE0
For channels = 4, arbitrary RADIUS is supported unless the LDS is not enough
Niko
6/29/2011
The info above may be obsolete.
***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0

View File

@@ -302,7 +302,9 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
lclcount[0]=0;
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
@@ -314,14 +316,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop && lcl_compute_win_id < queuecount;queueloop++)
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
int tempnodecounter = lcl_compute_id;
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
@@ -353,10 +358,12 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
@@ -368,11 +375,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];

View File

@@ -0,0 +1,111 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/************************************** convolve **************************************/
/*
 * convolve_D5 - 2-D convolution of a float image with a float kernel.
 *
 *   src, dst        input / output images; src_step and dst_step are shifted
 *                   right by 2 before use as row strides (presumably byte
 *                   steps converted to float elements - confirm with host).
 *   temp1           convolution coefficients, row stride (k_step >> 2).
 *   rows, cols      image size; source coordinates are clamped to
 *                   [0, rows-1] x [0, cols-1], so edge pixels are repeated.
 *   kWidth, kHeight coefficient window size.
 *
 * Every work-item stages four source samples into a 32x32 local tile
 * (offsets 0 and +16 in each dimension), then accumulates the window
 * centred on its own pixel from that tile.
 *
 * NOTE(review): the tile layout appears to assume a 16x16 work-group and
 * kWidth/kHeight <= 17; neither is checked here - confirm against the host.
 */
__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst,
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
{
    __local float smem[16 + 2 * 8][16 + 2 * 8];

    const int lx = get_local_id(0);
    const int ly = get_local_id(1);
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    /* Clamped source rows (already multiplied by the row stride) and columns
       that lie 8 pixels before / after this work-item's position. */
    const int srcStride = src_step >> 2;
    const int rowBefore = min(max(gy - 8, 0), rows - 1) * srcStride;
    const int rowAfter  = min(gy + 8, rows - 1) * srcStride;
    const int colBefore = min(max(gx - 8, 0), cols - 1);
    const int colAfter  = min(gx + 8, cols - 1);

    /* Fill the four quadrants of the tile: top-left, top-right,
       bottom-left, bottom-right. */
    smem[ly][lx]           = src[rowBefore + colBefore];
    smem[ly][lx + 16]      = src[rowBefore + colAfter];
    smem[ly + 16][lx]      = src[rowAfter + colBefore];
    smem[ly + 16][lx + 16] = src[rowAfter + colAfter];

    /* The whole tile must be written before any work-item reads it. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Work-items outside the image only helped fill the tile. */
    if (gx >= cols || gy >= rows)
    {
        return;
    }

    const int kStride = k_step >> 2;
    const int tileY = ly + 8 - kHeight / 2;   /* top row of the window in the tile */
    const int tileX = lx + 8 - kWidth / 2;    /* left column of the window in the tile */

    float res = 0;
    for (int i = 0; i < kHeight; ++i)
    {
        for (int j = 0; j < kWidth; ++j)
        {
            res += smem[tileY + i][tileX + j] * temp1[i * kStride + j];
        }
    }

    dst[gy*(dst_step >> 2)+gx] = res;
}

View File

@@ -8,6 +8,7 @@
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Xu Pang, pangxu010@163.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@@ -33,89 +34,127 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTITAL_HISTGRAM256_COUNT (256)
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define HISTOGRAM256_BIN_COUNT (256)
#define HISTGRAM256_WORK_GROUP_SIZE (256)
#define HISTGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
#define HISTOGRAM256_WORK_GROUP_SIZE (256)
#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calc_sub_hist_D0(__global const uchar4* src,
int src_step,
int src_offset,
__global int* buf,
int data_count,
int cols,
int inc_x,
int inc_y,
int dst_offset)
#define NBANKS (16)
#define NBANKS_BIT (4)
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
__global const uint4* src,
int src_step, int src_offset,
__global int* globalHist,
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
{
int x = get_global_id(0);
int lx = get_local_id(0);
int gx = get_group_id(0);
int total_threads = get_global_size(0);
src += src_offset;
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE];
s_hist[lx] = 0;
int pos_y = x / cols;
int pos_x = x - mul24(pos_y, cols);
barrier(CLK_LOCAL_MEM_FENCE);
for(int pos = x; pos < data_count; pos += total_threads)
{
int4 data = convert_int4(src[mad24(pos_y,src_step,pos_x)]);
atomic_inc(s_hist + data.x);
atomic_inc(s_hist + data.y);
atomic_inc(s_hist + data.z);
atomic_inc(s_hist + data.w);
pos_x +=inc_x;
int off = (pos_x >= cols ? -1 : 0);
pos_x = mad24(off,cols,pos_x);
pos_y += inc_y - off;
//pos_x = pos_x > cols ? pos_x - cols : pos_x;
//pos_y = pos_x > cols ? pos_y + 1 : pos_y;
}
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(gx, dst_offset, lx)] = s_hist[lx];
__local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
int gid = get_global_id(0);
int lid = get_local_id(0);
int gx = get_group_id(0);
int gsize = get_global_size(0);
int lsize = get_local_size(0);
const int shift = 8;
const int mask = HISTOGRAM256_BIN_COUNT-1;
int offset = (lid & (NBANKS-1));// lid % NBANKS
uint4 data, temp1, temp2, temp3, temp4;
src += src_offset;
//clear LDS
for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
{
subhist[idx] = 0;
subhist[idx+=lsize] = 0;
subhist[idx+=lsize] = 0;
subhist[idx+=lsize] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//read and scatter
int y = gid/cols;
int x = gid - mul24(y, cols);
for(int idx=gid; idx<dataCount; idx+=gsize)
{
data = src[mad24(y, src_step, x)];
temp1 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp2 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp3 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp4 = ((data & mask) << NBANKS_BIT) + offset;
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp4.w);
x += inc_x;
int off = ((x>=cols) ? -1 : 0);
x = mad24(off, cols, x);
y += inc_y - off;
}
barrier(CLK_LOCAL_MEM_FENCE);
//reduce local banks to single histogram per workgroup
int bin1=0, bin2=0, bin3=0, bin4=0;
for(int i=0; i<NBANKS; i+=4)
{
bin1 += subhist[(lid << NBANKS_BIT) + i];
bin2 += subhist[(lid << NBANKS_BIT) + i+1];
bin3 += subhist[(lid << NBANKS_BIT) + i+2];
bin4 += subhist[(lid << NBANKS_BIT) + i+3];
}
globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
}
__kernel void __attribute__((reqd_work_group_size(1,256,1)))calc_sub_hist2_D0( __global const uchar* src,
int src_step,
int src_offset,
__global int* buf,
int left_col,
int cols,
int rows,
int dst_offset)
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))calc_sub_hist_border_D0(
__global const uchar* src,
int src_step, int src_offset,
__global int* globalHist,
int left_col, int cols,
int rows, int hist_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
int gnum = get_num_groups(0);
int output_row = mad24(gy,gnum,gx);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
int lidy = get_local_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
int gn = get_num_groups(0);
int rowIndex = mad24(gy, gn, gx);
rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1);
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE+1];
s_hist[lidy] = 0;
//mem_fence(CLK_LOCAL_MEM_FENCE);
__local int subhist[HISTOGRAM256_BIN_COUNT + 1];
subhist[lidy] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
gidx = ((gidx>left_col) ? (gidx+cols) : gidx);
int src_index = src_offset + mad24(gidy, src_step, gidx);
int p = (int)src[src_index];
atomic_inc(subhist + p);
barrier(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= left_col ? cols+gidx : gidx;
//gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
//int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
int p = (int)src[src_index];
p = gidy >= rows ? HISTGRAM256_LOCAL_MEM_SIZE : p;
atomic_inc(s_hist + p);
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(output_row, dst_offset, lidy)] += s_hist[lidy];
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
@@ -126,13 +165,13 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
int sum = 0;
for(int i = lx; i < PARTITAL_HISTGRAM256_COUNT; i += HISTGRAM256_WORK_GROUP_SIZE)
for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE)
sum += buf[ mad24(i, src_step, gx)];
__local int data[HISTGRAM256_WORK_GROUP_SIZE];
__local int data[HISTOGRAM256_WORK_GROUP_SIZE];
data[lx] = sum;
for(int stride = HISTGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(lx < stride)

View File

@@ -109,10 +109,10 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
int4 val1, val2, val;
int4 sdata1, sdata2, sdata3, sdata4;
int4 pos1 = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos2 = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 pos3 = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos4 = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 pos1 = mad24((int4)y, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel);
int4 pos2 = mad24((int4)y, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel);
int4 pos3 = mad24((int4)y_, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel);
int4 pos4 = mad24((int4)y_, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel);
sdata1.s0 = src[pos1.s0];
sdata1.s1 = src[pos1.s1];
@@ -136,7 +136,7 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = mul24(V1 , val1) + mul24(V , val2);
val = mul24((int4)V1 , val1) + mul24((int4)V , val2);
//__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
//uchar4 dVal = *d;
@@ -205,8 +205,8 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24(mul24(U1, V1) , data0) + mul24(mul24(U, V1) , data1)
+mul24(mul24(U1, V) , data2)+mul24(mul24(U, V) , data3);
int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1)
+mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
@@ -314,7 +314,7 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
sy = min((int)floor(s5), src_rows-1);
uchar4 val;
int4 pos = mad24(sy, srcstep_in_pixel, sx+srcoffset_in_pixel);
int4 pos = mad24((int4)sy, (int4)srcstep_in_pixel, sx+(int4)srcoffset_in_pixel);
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];

File diff suppressed because it is too large Load Diff

View File

@@ -91,8 +91,8 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
F4 W = M[6]*DX + M[7]*dy + M[8];
W = (W!=0) ? 1./W : 0;
F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
W = (W!=zero) ? one/W : zero;
short4 X = convert_short4(rint(X0*W));
short4 Y = convert_short4(rint(Y0*W));
int4 sx = convert_int4(X);

View File

@@ -34,7 +34,8 @@
//
//
#define F float
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
@@ -56,17 +57,41 @@ __kernel void convert_to_S4_C1_D0(
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
}
@@ -114,17 +139,41 @@ __kernel void convert_to_S5_C1_D0(
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
}
__kernel void convert_to_S5_C4_D0(

View File

@@ -34,62 +34,9 @@
//
//
__kernel void copy_to_with_mask_C1_D0(
__global const uchar* restrict srcMat,
__global uchar* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int dst_addr_start = mad24((uint)y, (uint)dstStep_in_pixel, (uint)dstoffset_in_pixel);
int dst_addr_end = mad24((uint)y, (uint)dstStep_in_pixel, (uint)cols+dstoffset_in_pixel);
int dstidx = mad24((uint)y, (uint)dstStep_in_pixel, (uint)x+ dstoffset_in_pixel) & (int)0xfffffffc;
int vector_off = dstoffset_in_pixel & 3;
int srcidx = mad24((uint)y, (uint)srcStep_in_pixel, (uint)x + srcoffset_in_pixel - vector_off);
int mask_addr_start = mad24((uint)y, (uint)maskStep, (uint)maskoffset);
int mask_addr_end = mad24((uint)y, (uint)maskStep, (uint)cols+maskoffset);
int maskidx = mad24((uint)y, (uint)maskStep, (uint)x + maskoffset - vector_off);
if ( (x < cols + dstoffset_in_pixel) & (y < rows) )
{
uchar4 src_data = vload4(0, srcMat + srcidx);
uchar4 mask_data = vload4(0, maskMat + maskidx);
uchar4 dst_data = *((__global uchar4 *)(dstMat + dstidx));
uchar4 tmp_data;
mask_data.x = ((maskidx + 0 >= mask_addr_start) && (maskidx + 0 < mask_addr_end)) ? mask_data.x : 0;
mask_data.y = ((maskidx + 1 >= mask_addr_start) && (maskidx + 1 < mask_addr_end)) ? mask_data.y : 0;
mask_data.z = ((maskidx + 2 >= mask_addr_start) && (maskidx + 2 < mask_addr_end)) ? mask_data.z : 0;
mask_data.w = ((maskidx + 3 >= mask_addr_start) && (maskidx + 3 < mask_addr_end)) ? mask_data.w : 0;
tmp_data.x = ((dstidx + 0 >= dst_addr_start) && (dstidx + 0 < dst_addr_end) && (mask_data.x))
? src_data.x : dst_data.x;
tmp_data.y = ((dstidx + 1 >= dst_addr_start) && (dstidx + 1 < dst_addr_end) && (mask_data.y))
? src_data.y : dst_data.y;
tmp_data.z = ((dstidx + 2 >= dst_addr_start) && (dstidx + 2 < dst_addr_end) && (mask_data.z))
? src_data.z : dst_data.z;
tmp_data.w = ((dstidx + 3 >= dst_addr_start) && (dstidx + 3 < dst_addr_end) && (mask_data.w))
? src_data.w : dst_data.w;
(*(__global uchar4*)(dstMat+dstidx)) = tmp_data;
}
}
__kernel void copy_to_with_mask_C4_D0(
__global const uchar4* restrict srcMat,
__global uchar4* dstMat,
__kernel void copy_to_with_mask(
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
@@ -102,107 +49,13 @@ __kernel void copy_to_with_mask_C4_D0(
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C1_D4(
__global const int* restrict srcMat,
__global int* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C4_D4(
__global const int4* restrict srcMat,
__global int4* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C1_D5(
__global const float* restrict srcMat,
__global float* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C4_D5(
__global const float4* restrict srcMat,
__global float4* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}

View File

@@ -40,24 +40,40 @@ __kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,(int)(x+ offset_in_pixel & (int)0xfffffffc));
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows))
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else if(y < rows)
else
{
uchar4 temp = *(__global uchar4*)(dstMat+idx);
temp.x = (idx>=addr_start)&(idx < addr_end)? out.x : temp.x;
temp.y = (idx+1>=addr_start)&(idx+1 < addr_end)? out.y : temp.y;
temp.z = (idx+2>=addr_start)&(idx+2 < addr_end)? out.z : temp.z;
temp.w = (idx+3>=addr_start)&(idx+3 < addr_end)? out.w : temp.w;
*(__global uchar4*)(dstMat+idx) = temp;
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
}

View File

@@ -33,81 +33,6 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
__global uchar* dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar_sat(scalar.x);
}
}
*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
// Masked fill of a single-channel uchar matrix with `scalar`.
// Each work-item covers four consecutive columns (x = global id * 4) and
// performs one uchar4 read-modify-write on dstMat. Because the destination
// and mask offsets may differ modulo 4, mask bytes are gathered from three
// neighbouring uchar4 loads and re-aligned through a 10-entry window.
__kernel void set_to_with_mask_C1_D0(
uchar scalar,
__global uchar* dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
// [dst_addr_start, dst_addr_end) is the valid destination index range of row y.
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
// '+' binds tighter than '&': the column term (x + offset) is rounded down to a multiple of 4.
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
// Same range / rounded index computation for the mask row.
int mask_addr_start = mad24(y,maskStep,maskoffset);
int mask_addr_end = mad24(y,maskStep,cols+maskoffset);
int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc);
// Position inside trans_mask[] of the mask byte that pairs with dstMat[dstidx]; value lies in 0..6.
int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3;
if ( (x < cols) & (y < rows) )
{
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
// NOTE(review): the three loads below happen before the range tests, so the
// ones at maskidx-4 and maskidx+4 may address bytes outside the mask buffer -- confirm.
uchar4 temp_mask1 = *(__global uchar4*)(maskMat+maskidx-4);
uchar4 temp_mask = *(__global uchar4*)(maskMat+maskidx);
uchar4 temp_mask2 = *(__global uchar4*)(maskMat+maskidx+4);
// Zero every loaded mask byte that lies outside [mask_addr_start, mask_addr_end).
temp_mask1.x = (maskidx-4 >=mask_addr_start)&(maskidx-4 < mask_addr_end) ? temp_mask1.x : 0;
temp_mask1.y = (maskidx-3 >=mask_addr_start)&(maskidx-3 < mask_addr_end) ? temp_mask1.y : 0;
temp_mask1.z = (maskidx-2 >=mask_addr_start)&(maskidx-2 < mask_addr_end) ? temp_mask1.z : 0;
temp_mask1.w = (maskidx-1 >=mask_addr_start)&(maskidx-1 < mask_addr_end) ? temp_mask1.w : 0;
temp_mask.x = (maskidx >=mask_addr_start)&(maskidx < mask_addr_end) ? temp_mask.x : 0;
temp_mask.y = (maskidx+1 >=mask_addr_start)&(maskidx+1 < mask_addr_end) ? temp_mask.y : 0;
temp_mask.z = (maskidx+2 >=mask_addr_start)&(maskidx+2 < mask_addr_end) ? temp_mask.z : 0;
temp_mask.w = (maskidx+3 >=mask_addr_start)&(maskidx+3 < mask_addr_end) ? temp_mask.w : 0;
temp_mask2.x = (maskidx+4 >=mask_addr_start)&(maskidx+4 < mask_addr_end) ? temp_mask2.x : 0;
temp_mask2.y = (maskidx+5 >=mask_addr_start)&(maskidx+5 < mask_addr_end) ? temp_mask2.y : 0;
temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0;
temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0;
// Window holding mask bytes maskidx-3 .. maskidx+6.
uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z};
// Overwrite a lane only if it is inside the destination row and its mask byte selects it.
// NOTE(review): the condition is a bitwise '&' of two 0/1 comparisons with the mask
// byte, so only bit 0 of the mask is tested -- confirm masks are always 0 or odd (e.g. 255).
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? scalar : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? scalar : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? scalar : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? scalar : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
}
}
__kernel void set_to_with_mask(
GENTYPE scalar,
__global GENTYPE * dstMat,
@@ -121,10 +46,12 @@ __kernel void set_to_with_mask(
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
if (mask)
{
dstMat[dstidx] = scalar;
}

View File

@@ -16,7 +16,6 @@
//
// @Authors
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@@ -119,19 +118,81 @@ uchar4 round_uchar4_float4(float4 v)
return round_uchar4_int4(iv);
}
// Index-folding macros used to keep row/column indices inside [0, last].
// *_HIGH evaluates | |last - v| - last | (the row variant additionally takes
// the result modulo last + 1); *_LOW evaluates |v| modulo (last + 1).
// abs_diff/abs are the OpenCL integer built-ins.
#define IDX_ROW_HIGH(y,last_row) (abs_diff((int)abs_diff(last_row,y),last_row) % ((last_row)+1))
#define IDX_ROW_LOW(y,last_row) (abs(y) % ((last_row) + 1))
#define IDX_COL_HIGH(x,last_col) abs_diff((int)abs_diff(x,last_col),last_col)
#define IDX_COL_LOW(x,last_col) (abs(x) % ((last_col) + 1))
// Folds a row index at the lower border: returns |y| modulo (last_row + 1).
int idx_row_low(int y, int last_row)
{
    const int magnitude = (y < 0) ? -y : y;
    const int row_count = last_row + 1;
    return magnitude % row_count;
}
// Folds a row index at the upper border: returns
// | last_row - |last_row - y| | modulo (last_row + 1).
int idx_row_high(int y, int last_row)
{
    // Distance of y from the last row, as a non-negative value.
    int distance = last_row - y;
    if (distance < 0)
    {
        distance = y - last_row;
    }

    // Step that distance back from the last row; fold again if it overshoots 0.
    int folded = last_row - distance;
    if (folded < 0)
    {
        folded = distance - last_row;
    }

    return folded % (last_row + 1);
}
// Maps an arbitrary row index into [0, last_row]: the index is first folded
// at the upper border (idx_row_high) and the result at the lower border
// (idx_row_low). The function previously contained two consecutive return
// statements, the second of which could never execute; only the
// typed-helper form is kept.
int idx_row(int y, int last_row)
{
    return idx_row_low(idx_row_high(y, last_row), last_row);
}
// Folds a column index at the left border: returns |x| modulo (last_col + 1).
int idx_col_low(int x, int last_col)
{
    const int magnitude = (x < 0) ? -x : x;
    const int col_count = last_col + 1;
    return magnitude % col_count;
}
// Folds a column index at the right border: returns
// | last_col - |last_col - x| | modulo (last_col + 1).
int idx_col_high(int x, int last_col)
{
    // Distance of x from the last column, as a non-negative value.
    int distance = last_col - x;
    if (distance < 0)
    {
        distance = x - last_col;
    }

    // Step that distance back from the last column; fold again if it overshoots 0.
    int folded = last_col - distance;
    if (folded < 0)
    {
        folded = distance - last_col;
    }

    return folded % (last_col + 1);
}
// Maps an arbitrary column index into [0, last_col]: the index is first
// folded at the right border (idx_col_high) and the result at the left
// border (idx_col_low). The function previously contained two consecutive
// return statements, the second of which could never execute; only the
// typed-helper form is kept.
int idx_col(int x, int last_col)
{
    return idx_col_low(idx_col_high(x, last_col), last_col);
}
__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols)
@@ -149,11 +210,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset
sum = 0;
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)];
sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)];
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)];
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]);
sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]);
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]);
smem[2 + get_local_id(0)] = sum;
@@ -163,11 +224,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset
sum = 0;
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)];
sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)];
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)];
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
smem[get_local_id(0)] = sum;
}
@@ -178,11 +239,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset
sum = 0;
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)];
sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)];
sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)];
sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)];
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)]);
sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
smem[4 + get_local_id(0)] = sum;
}
@@ -227,11 +288,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse
sum = 0;
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]);
sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]);
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]);
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
smem[2 + get_local_id(0)] = sum;
@@ -241,11 +302,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse
sum = 0;
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]);
sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]);
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]);
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
smem[get_local_id(0)] = sum;
}
@@ -256,11 +317,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse
sum = 0;
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]);
sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)]);
sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]);
sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]);
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
smem[4 + get_local_id(0)] = sum;
}

View File

@@ -1,427 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Compile-time constants shared by the stereo block-matching routines below.
#define ROWSperTHREAD 21 // the number of rows a thread will process
#define BLOCK_W 128 // the thread block width (464)
// Number of SSD planes handled per pass; MinSSD sizes its ssd[] array with it.
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
// NOTE(review): the trailing comment says the step "must be <= 1", yet it
// expands to N_DISPARITIES (8) -- confirm which one is intended.
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
// Returns the square of its argument.
int SQ(int a)
{
    const int squared = a * a;
    return squared;
}
// Returns col_ssd[0] + sum(col_ssd[1..radius]) + an "upper" partial sum.
// The lower partial sum is published in col_ssd_cache[0]; after the barrier,
// work-items with local id below BLOCK_W - radius take the upper part from
// col_ssd_cache[radius], the others add col_ssd[radius+1 .. 2*radius] directly.
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
                     volatile __local unsigned int *col_ssd, int radius)
{
    unsigned int lower_sum = 0;
    unsigned int upper_sum = 0;

    int k = 1;
    while (k <= radius)
    {
        lower_sum += col_ssd[k];
        ++k;
    }

    col_ssd_cache[0] = lower_sum;

    // Make every work-item's published partial sum visible before it is read.
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(0) >= BLOCK_W - radius)
    {
        const int window = 2 * radius + 1;
        for (int j = radius + 1; j < window; j++)
        {
            upper_sum += col_ssd[j];
        }
    }
    else
    {
        upper_sum = col_ssd_cache[radius];
    }

    return col_ssd[0] + lower_sum + upper_sum;
}
// Evaluates CalcSSD on N_DISPARITIES consecutive planes of col_ssd (plane
// stride BLOCK_W + 2 * radius, i.e. COL_SSD_SIZE), with a local barrier after
// each evaluation. Returns (smallest SSD, index of the last plane that
// attains it).
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
             volatile __local unsigned int *col_ssd, int radius)
{
    unsigned int ssd[N_DISPARITIES];
    const int plane_stride = BLOCK_W + 2 * radius;

    for (int plane = 0; plane < N_DISPARITIES; plane++)
    {
        ssd[plane] = CalcSSD(col_ssd_cache, col_ssd + plane * plane_stride, radius);
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    unsigned int mssd = ssd[0];
    for (int plane = 1; plane < N_DISPARITIES; plane++)
    {
        mssd = min(mssd, ssd[plane]);
    }

    // Scan from the top so that ties resolve to the highest plane index.
    int bestIdx = 0;
    for (int plane = N_DISPARITIES - 1; plane >= 0; plane--)
    {
        if (ssd[plane] == mssd)
        {
            bestIdx = plane;
            break;
        }
    }

    return (uint2)(mssd, bestIdx);
}
// Updates eight SSD planes of col_ssd for a one-row step: for each shift
// k = 0..7 the squared difference taken at idx1 is subtracted and the one
// taken at idx2 is added. Left pixels come from imageL[idx1] / imageL[idx2],
// right pixels from imageR at the same index minus (d + k).
// Plane stride is BLOCK_W + 2 * radius (COL_SSD_SIZE).
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
              __global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
{
    const unsigned char leftPixel1 = imageL[idx1];
    const unsigned char leftPixel2 = imageL[idx2];
    const int right_base1 = idx1 - d;
    const int right_base2 = idx2 - d;
    const int plane_stride = BLOCK_W + 2 * radius;

    unsigned char rightPixel1[8];
    unsigned char rightPixel2[8];

    for (int k = 0; k < 8; k++)
    {
        rightPixel1[k] = imageR[right_base1 - k];
    }
    for (int k = 0; k < 8; k++)
    {
        rightPixel2[k] = imageR[right_base2 - k];
    }

    for (int k = 0; k < 8; k++)
    {
        unsigned int diff1 = leftPixel1 - rightPixel1[k];
        unsigned int diff2 = leftPixel2 - rightPixel2[k];
        col_ssd[k * plane_stride] += SQ(diff2) - SQ(diff1);
    }
}
// Initialises eight SSD planes of col_ssd for one column: over 2 * radius + 1
// consecutive rows starting at y_tex, accumulates the squared difference
// between imageL[row * im_pitch + x_tex] and imageR at the same index minus
// (d + k), for k = 0..7. Plane stride is BLOCK_W + 2 * radius (COL_SSD_SIZE).
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
                __global unsigned char* imageR, int d,
                volatile __local unsigned int *col_ssd, int radius)
{
    unsigned int acc[] = {0, 0, 0, 0, 0, 0, 0, 0};
    const int plane_stride = BLOCK_W + 2 * radius;

    int row = y_tex;
    for (int remaining = 2 * radius + 1; remaining > 0; --remaining)
    {
        const int left_idx = row * im_pitch + x_tex;
        const unsigned char left_pixel = imageL[left_idx];
        const int right_idx = left_idx - d;

        for (int k = 0; k < 8; k++)
        {
            acc[k] += SQ(left_pixel - imageR[right_idx - k]);
        }

        row += 1;
    }

    for (int k = 0; k < 8; k++)
    {
        col_ssd[k * plane_stride] = acc[k];
    }
}
// Block-matching kernel: for disparities d = STEREO_MIND, STEREO_MIND +
// STEREO_DISP_STEP, ... < maxdisp it builds per-column SSD planes in local
// memory (InitColSSD / StepDown), reduces them with MinSSD and, wherever the
// new minimum is smaller than the value stored in cminSSDImage, writes the
// minimum there and the winning disparity (d + plane index) into disp.
// Each work-group row band covers up to ROWSperTHREAD image rows.
// col_ssd_cache is the local scratch buffer supplied by the host.
// NOTE(review): the early "return" and the conditionally executed MinSSD
// calls (which contain barriers) mean not every work-item of a group reaches
// the same barriers -- confirm this is safe on the targeted devices.
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
__global unsigned int *cminSSDImage, int cminSSD_step,
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
int img_step, int maxdisp, int radius,
__local unsigned int *col_ssd_cache)
{
// This work-item's slot in the SSD planes, placed BLOCK_W entries into the scratch buffer.
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
// The first 2 * radius work-items also own a second slot BLOCK_W entries further on; null otherwise.
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
// Y is a macro rather than a variable; it is not #undef'd inside this kernel.
#define Y (get_group_id(1) * ROWSperTHREAD + radius)
volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned char* disparImage = disp + X + Y * disp_step;
// Number of rows handled by this band, limited by the rows left below Y.
int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
int y_tex;
int x_tex = X - radius;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
// Build the column sums for the first row of the band.
y_tex = Y - radius;
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius);
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
if (X < cwidth - radius && Y < cheight - radius)
{
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
// Keep the result only if it beats the best SSD recorded so far for this pixel.
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
}
}
// Remaining rows: slide the column sums down one row at a time.
for(int row = 1; row < end_row; row++)
{
// idx1: row handed to StepDown as the one to subtract; idx2: row 2 * radius + 1 below it, to add.
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius);
y_tex += 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (X < cwidth - radius && row < cheight - radius - Y)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
}
}
} // for row loop
} // for d loop
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// Sobel Prefilter (single channel) /////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Horizontal Sobel pre-filter on a tightly packed single-channel image
// (row stride == cols). For each pixel the 3x3 x-derivative is computed,
// limited to [-prefilterCap, prefilterCap], shifted by prefilterCap, capped
// at 255 and stored as a byte in `output`.
// Neighbour coordinates are clamped to the image so border pixels no longer
// read outside `input` (rows -1 / rows and columns -1 / cols were previously
// addressed unguarded). Interior pixels are unaffected.
__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
                               int rows, int cols, int prefilterCap)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if(x < cols && y < rows)
    {
        // Replicate the border: clamp every neighbour coordinate into the image.
        const int x_left  = max(x - 1, 0);
        const int x_right = min(x + 1, cols - 1);
        const int y_up    = max(y - 1, 0);
        const int y_down  = min(y + 1, rows - 1);

        int cov = input[y_up   * cols + x_left] * (-1) + input[y_up   * cols + x_right] * (1) +
                  input[y      * cols + x_left] * (-2) + input[y      * cols + x_right] * (2) +
                  input[y_down * cols + x_left] * (-1) + input[y_down * cols + x_right] * (1);

        cov = min(min(max(-prefilterCap, cov), prefilterCap) + prefilterCap, 255);
        output[y * cols + x] = cov & 0xFF;
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Absolute value of the 3x3 horizontal Sobel response at (x, y) of a tightly
// packed single-channel image (row stride == cols). Returns 0 when x >= cols
// or y >= rows.
// All sampled coordinates are clamped into [0, cols-1] x [0, rows-1].
// Previously only the exact value 0 was special-cased on the low side, so
// negative x / y (callers pass x - winsz2, y - winsz2 - 1) and x + 1 / y + 1
// on the last column / row indexed outside the image. Results for interior
// pixels and for x == 0 / y == 0 are unchanged.
float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
{
    float conv = 0;
    if(x < cols && y < rows)
    {
        const int last_col = cols - 1;
        const int last_row = rows - 1;

        const int x_left  = (x - 1 < 0) ? 0 : x - 1;
        const int x_right = (x + 1 < 0) ? 0 : ((x + 1 > last_col) ? last_col : x + 1);
        const int y_up    = (y - 1 < 0) ? 0 : y - 1;
        const int y_mid   = (y < 0) ? 0 : y;
        const int y_down  = (y + 1 < 0) ? 0 : ((y + 1 > last_row) ? last_row : y + 1);

        conv = (float)input[y_up   * cols + x_left] * (-1) + (float)input[y_up   * cols + x_right] * (1) +
               (float)input[y_mid  * cols + x_left] * (-2) + (float)input[y_mid  * cols + x_right] * (2) +
               (float)input[y_down * cols + x_left] * (-1) + (float)input[y_down * cols + x_right] * (1);
    }
    return fabs(conv);
}
// Returns cols[0] + sum(cols[1..winsz/2]) + an "upper" partial sum.
// The lower partial sum is published in cols_cache[0]; after the barrier,
// work-items with local id below local_size - winsz/2 take the upper part
// from cols_cache[winsz/2], the others add cols[winsz/2 + 1 .. winsz - 1]
// directly.
float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
{
    const int half_win = winsz/2;
    const int lid = get_local_id(0);
    const int lsize = get_local_size(0);

    float lower_sum = 0;
    float upper_sum = 0;

    int k = 1;
    while (k <= half_win)
    {
        lower_sum += cols[k];
        ++k;
    }

    cols_cache[0] = lower_sum;

    // Make every work-item's published partial sum visible before it is read.
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid >= lsize - half_win)
    {
        for (int j = half_win + 1; j < winsz; j++)
        {
            upper_sum += cols[j];
        }
    }
    else
    {
        upper_sum = cols_cache[half_win];
    }

    return cols[0] + lower_sum + upper_sum;
}
// Rows processed by one work-group row band of textureness_kernel.
#define RpT (2 * ROWSperTHREAD) // got experimentally
// Texture filter: for every pixel of a band of up to RpT rows, sums the
// absolute Sobel response (sobel()) of `input` over a winsz-wide window,
// scales the sum by 255 and, when it is below `threshold`, writes 0 to the
// corresponding element of `disp`. Vertical sums are kept per column in local
// memory and updated incrementally from row to row; the horizontal sum is
// done by CalcSums. cols_cache is the local scratch buffer supplied by the host.
// NOTE(review): the barriers (here and inside CalcSums) sit inside
// "if (x < disp_cols)", so work-items with x >= disp_cols never reach them --
// confirm the launch configuration makes this safe.
__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
int disp_step, __global unsigned char *input, int input_rows,
int input_cols,int winsz, float threshold,
__local float *cols_cache)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
int local_id_x = get_local_id(0);
int group_size_x = get_local_size(0);
int group_id_y = get_group_id(1);
// This work-item's column-sum slot, placed group_size_x entries into the scratch buffer.
__local float *cols = cols_cache + group_size_x + local_id_x;
// The first n_dirty_pixels work-items also own a second slot group_size_x entries further on; null otherwise.
__local float *cols_extra = local_id_x < n_dirty_pixels ? cols + group_size_x : 0;
int x = get_global_id(0);
int beg_row = group_id_y * RpT;
int end_row = min(beg_row + RpT, disp_rows);
if (x < disp_cols)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
// Full vertical window sum for the first row of the band.
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(input, x - winsz2, i, input_rows, input_cols);
if (cols_extra)
sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Remaining rows: this loop's y shadows the outer y; the vertical sum is
// updated by dropping the row that left the window and adding the new one.
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
*cols_extra = sum_extra;
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}

View File

@@ -1,580 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Enable a double-precision extension depending on the vendor macro supplied at
// build time: the AMD-specific one when __ATI__ is defined, the Khronos one when
// __NVIDIA__ is defined. If neither macro is defined, no extension is enabled.
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////
/////////////////common///////////////////////////////////////
/////////////////////////////////////////////////////////////
// Converts a float to short, rounding to nearest-even and saturating at the
// limits of the short range instead of wrapping.
short round_short(float v)
{
    const short rounded = convert_short_sat_rte(v);
    return rounded;
}
// Largest finite single-precision value; used below as the starting value of
// running-minimum searches.
#define FLOAT_MAX 3.402823466e+38f
// Parameter block that the comp_data_* kernels read through a __constant pointer.
// NOTE(review): presumably filled from a matching host-side struct, so field
// order and types should not be changed without checking the host code.
typedef struct
{
int cndisp;              // number of disparity levels the comp_data_* loops iterate over
float cmax_data_term;    // cap on the per-pixel difference; comp_data_* clamps to cdata_weight * cmax_data_term
float cdata_weight;      // multiplier applied to the per-pixel difference
float cmax_disc_term;    // not read through this struct in the kernels visible here
float cdisc_single_jump; // not read through this struct in the kernels visible here
}con_srtuct_t;
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
// Absolute difference between one left byte and one right byte, returned as float.
float pix_diff_1(__global const uchar *ls, __global const uchar *rs)
{
    const int left_px = (int)ls[0];
    return abs(left_px - rs[0]);
}
// Weighted sum of per-channel absolute differences for a 3-byte pixel.
// Byte 0 is weighted 0.114, byte 1 0.587 and byte 2 0.299.
// NOTE(review): the weights look like B/G/R luma coefficients, which would mean
// pixels are stored in B, G, R order - confirm against the caller.
float pix_diff_3(__global const uchar *ls, __global const uchar *rs)
{
    const float w_ch2 = 0.299f;
    const float w_ch1 = 0.587f;
    const float w_ch0 = 0.114f;

    float acc = w_ch0 * abs((int)ls[0] - rs[0]);
    acc += w_ch1 * abs((int)ls[1] - rs[1]);
    acc += w_ch2 * abs((int)ls[2] - rs[2]);
    return acc;
}
// Weighted sum of per-channel absolute differences for a 4-byte pixel.
// Both pixels are fetched as a single uchar4; the fourth component is ignored.
// Components x, y, z are weighted 0.114, 0.587 and 0.299 respectively.
float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
{
    const float w_z = 0.299f;
    const float w_y = 0.587f;
    const float w_x = 0.114f;

    const uchar4 lhs = *((__global const uchar4 *)ls);
    const uchar4 rhs = *((__global const uchar4 *)rs);

    float acc = w_x * abs((int)lhs.x - rhs.x);
    acc += w_y * abs((int)lhs.y - rhs.y);
    acc += w_z * abs((int)lhs.z - rhs.z);
    return acc;
}
// Fills the short data-cost volume: for every interior pixel (x, y) and every
// disparity in [0, con_st->cndisp), stores
//     round_short(min(weight * diff(left(x), right(x - disp)), weight * max_term))
// or the capped value directly when x - disp would fall left of column 1.
//
// left/right : input images with row strides left_step/right_step in bytes
// data       : output volume; row stride data_step in bytes, and consecutive
//              disparities are data_cols * left_rows elements apart
// cn         : bytes per pixel; 1, 3 and 4 select a difference function, any
//              other value yields a difference of 0
__kernel void comp_data_0(__global uchar *left, int left_rows, int left_cols, int left_step,
                          __global uchar *right, int right_step,
                          __global short *data, int data_cols, int data_step,
                          __constant con_srtuct_t *con_st, int cn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items on the one-pixel border or outside the image write nothing.
    if (y <= 0 || y >= (left_rows - 1) || x <= 0 || x >= (left_cols - 1))
        return;

    const __global uchar *left_px = left + y * left_step + x * cn;
    const __global uchar *right_px = right + y * right_step + x * cn;
    __global short *cost = (__global short *)((__global uchar *)data + y * data_step) + x;
    const unsigned int plane_stride = data_cols * left_rows;

    const int ndisp = con_st->cndisp;
    const float weight = con_st->cdata_weight;
    const float capped = weight * con_st->cmax_data_term;

    for (int disp = 0; disp < ndisp; disp++)
    {
        if (x - disp < 1)
        {
            cost[disp * plane_stride] = round_short(capped);
            continue;
        }

        const __global uchar *shifted = right_px - disp * cn;
        float val = 0;
        switch (cn)
        {
        case 1:
            val = pix_diff_1(left_px, shifted);
            break;
        case 3:
            val = pix_diff_3(left_px, shifted);
            break;
        case 4:
            val = pix_diff_4(left_px, shifted);
            break;
        default:
            break;
        }
        cost[disp * plane_stride] = round_short(fmin(weight * val, capped));
    }
}
// Float counterpart of comp_data_0: for every interior pixel (x, y) and every
// disparity in [0, con_st->cndisp), stores
//     min(weight * diff(left(x), right(x - disp)), weight * max_term)
// or the capped value directly when x - disp would fall left of column 1.
//
// left/right : input images with row strides left_step/right_step in bytes
// data       : output volume; row stride data_step in bytes, and consecutive
//              disparities are data_cols * left_rows elements apart
// cn         : bytes per pixel; 1, 3 and 4 select a difference function, any
//              other value yields a difference of 0
__kernel void comp_data_1(__global uchar *left, int left_rows, int left_cols, int left_step,
                          __global uchar *right, int right_step,
                          __global float *data, int data_cols, int data_step,
                          __constant con_srtuct_t *con_st, int cn)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items on the one-pixel border or outside the image write nothing.
    if (y <= 0 || y >= left_rows - 1 || x <= 0 || x >= left_cols - 1)
        return;

    const __global uchar *left_px = left + y * left_step + x * cn;
    const __global uchar *right_px = right + y * right_step + x * cn;
    __global float *cost = (__global float *)((__global char *)data + y * data_step) + x;
    const unsigned int plane_stride = data_cols * left_rows;

    const int ndisp = con_st->cndisp;
    const float weight = con_st->cdata_weight;
    const float capped = weight * con_st->cmax_data_term;

    for (int disp = 0; disp < ndisp; disp++)
    {
        if (x - disp < 1)
        {
            cost[disp * plane_stride] = capped;
            continue;
        }

        const __global uchar *shifted = right_px - disp * cn;
        float val = 0;
        switch (cn)
        {
        case 1:
            val = pix_diff_1(left_px, shifted);
            break;
        case 3:
            val = pix_diff_3(left_px, shifted);
            break;
        case 4:
            val = pix_diff_4(left_px, shifted);
            break;
        default:
            break;
        }
        cost[disp * plane_stride] = fmin(weight * val, capped);
    }
}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
// Halves the resolution of a short cost volume: each destination cell receives
// the sum of the 2x2 source cells it covers, rounded and saturated to short,
// independently for each of the cndisp planes.
//
// src  : plane d, row r starts at element (d * src_rows + r) * src_cols
// dst  : plane d, row r starts at element (d * dst_rows + r) * dst_real_cols;
//        only columns [0, dst_cols) and rows [0, dst_rows) are written
__kernel void data_step_down_0(__global short *src, int src_rows, int src_cols,
                               __global short *dst, int dst_rows, int dst_cols, int dst_real_cols,
                               int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= dst_cols || y >= dst_rows)
        return;

    const int src_x = 2 * x;
    const int src_y = 2 * y;

    for (int plane = 0; plane < cndisp; ++plane)
    {
        const int top = (plane * src_rows + src_y) * src_cols;
        const int bottom = (plane * src_rows + (src_y + 1)) * src_cols;

        float acc = src[top + src_x];
        acc += src[bottom + src_x];
        acc += src[top + src_x + 1];
        acc += src[bottom + src_x + 1];

        dst[(plane * dst_rows + y) * dst_real_cols + x] = round_short(acc);
    }
}
// Halves the resolution of a float cost volume: each destination cell receives
// the sum of the 2x2 source cells it covers, independently for each of the
// cndisp planes.
//
// src  : plane d, row r starts at element (d * src_rows + r) * src_cols
// dst  : plane d, row r starts at element (d * dst_rows + r) * dst_real_cols;
//        only columns [0, dst_cols) and rows [0, dst_rows) are written
//
// The float sum is stored unmodified. Passing it through round_short(), as the
// short variant does, would round it to an integer and clamp it to the short
// range, silently discarding precision in the float pipeline.
__kernel void data_step_down_1(__global float *src, int src_rows, int src_cols,
                               __global float *dst, int dst_rows, int dst_cols, int dst_real_cols,
                               int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x < dst_cols && y < dst_rows)
    {
        for (int d = 0; d < cndisp; ++d)
        {
            float dst_reg;
            dst_reg  = src[(d * src_rows + (2*y+0)) * src_cols + 2*x+0];
            dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+0];
            dst_reg += src[(d * src_rows + (2*y+0)) * src_cols + 2*x+1];
            dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+1];

            dst[(d * dst_rows + y) * dst_real_cols + x] = dst_reg;
        }
    }
}
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
// Copies a coarse short message volume into one of twice the resolution:
// destination cell (x, y) of every plane takes the value of source cell
// (x/2, y/2) of the same plane.
//
// src_step/dst_step are row strides in bytes; consecutive planes are
// (step / sizeof(short)) * rows elements apart.
__kernel void level_up_message_0(__global short *src, int src_rows, int src_step,
                                 __global short *dst, int dst_rows, int dst_cols, int dst_step,
                                 int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= dst_cols || y >= dst_rows)
        return;

    const int dst_plane = (dst_step / sizeof(short)) * dst_rows;
    const int src_plane = (src_step / sizeof(short)) * src_rows;

    __global short *out = (__global short *)((__global char *)dst + y * dst_step) + x;
    __global const short *in = (__global short *)((__global char *)src + y/2 * src_step) + x/2;

    for (int plane = 0; plane < cndisp; ++plane)
    {
        out[plane * dst_plane] = in[plane * src_plane];
    }
}
// Copies a coarse float message volume into one of twice the resolution:
// destination cell (x, y) of every plane takes the value of source cell
// (x/2, y/2) of the same plane.
//
// src_step/dst_step are row strides in bytes; consecutive planes are
// (step / sizeof(float)) * rows elements apart.
__kernel void level_up_message_1(__global float *src, int src_rows, int src_step,
                                 __global float *dst, int dst_rows, int dst_cols, int dst_step,
                                 int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= dst_cols || y >= dst_rows)
        return;

    const int dst_plane = (dst_step/sizeof(float)) * dst_rows;
    const int src_plane = (src_step/sizeof(float)) * src_rows;

    __global float *out = (__global float *)((__global char *)dst + y * dst_step) + x;
    __global const float *in = (__global float *)((__global char *)src + y/2 * src_step) + x/2;

    for (int plane = 0; plane < cndisp; ++plane)
    {
        out[plane * dst_plane] = in[plane * src_plane];
    }
}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
// Two-pass in-place lowering of a strided short cost vector: after the call no
// entry exceeds a neighbouring entry by more than cdisc_single_jump (up to the
// rounding applied on store).
//
// dst       : first entry; entry k lives at dst[k * disp_step]
// cndisp    : number of entries
// The running value is carried in float and is not re-read after a store, so
// the unrounded candidate propagates to the next entry.
void calc_min_linear_penalty_0(__global short * dst, int disp_step,
                               int cndisp, float cdisc_single_jump)
{
    // Forward sweep: entry k may cost at most entry k-1 plus one jump.
    float running = dst[0];
    for (int k = 1; k < cndisp; ++k)
    {
        const int idx = disp_step * k;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];
        if (candidate < stored)
        {
            dst[idx] = round_short(candidate);
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }

    // Backward sweep: entry k may cost at most entry k+1 plus one jump.
    running = dst[(cndisp - 1) * disp_step];
    for (int k = cndisp - 2; k >= 0; k--)
    {
        const int idx = disp_step * k;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];
        if (candidate < stored)
        {
            dst[idx] = round_short(candidate);
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }
}
// Computes one outgoing short message vector in place.
//
// msg1, msg2, msg3 : three incoming message vectors, entries msg_disp_step apart
// data             : data-cost vector, entries data_disp_step apart
// dst              : output vector, entries msg_disp_step apart
// cndisp           : number of entries in every vector
//
// Steps:
//   1. dst[k] = msg1[k] + msg2[k] + msg3[k] + data[k], tracking the minimum.
//   2. Lower dst with calc_min_linear_penalty_0 (linear jump cost).
//   3. Clamp every entry to minimum + cmax_disc_term.
//   4. Subtract the mean of the clamped entries from every entry.
//
// Every store goes through round_short so the value is rounded to nearest and
// saturated. Step 4 previously used "dst[...] -= sum", an implicit
// float-to-short conversion that truncates toward zero and is undefined when
// the difference lies outside the short range.
void message_0(const __global short *msg1, const __global short *msg2,
               const __global short *msg3, const __global short *data, __global short *dst,
               int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
    float minimum = FLOAT_MAX;
    for (int i = 0; i < cndisp; ++i)
    {
        float dst_reg;
        dst_reg  = msg1[msg_disp_step * i];
        dst_reg += msg2[msg_disp_step * i];
        dst_reg += msg3[msg_disp_step * i];
        dst_reg += data[data_disp_step * i];

        if (dst_reg < minimum)
            minimum = dst_reg;

        dst[msg_disp_step * i] = round_short(dst_reg);
    }

    calc_min_linear_penalty_0(dst, msg_disp_step, cndisp, cdisc_single_jump);

    minimum += cmax_disc_term;

    float sum = 0;
    for (int i = 0; i < cndisp; ++i)
    {
        float dst_reg = dst[msg_disp_step * i];
        if (dst_reg > minimum)
        {
            dst_reg = minimum;
            dst[msg_disp_step * i] = round_short(minimum);
        }
        sum += dst_reg;
    }
    sum /= cndisp;

    // Normalise around the mean with an explicit rounding, saturating store.
    for (int i = 0; i < cndisp; ++i)
        dst[msg_disp_step * i] = round_short(dst[msg_disp_step * i] - sum);
}
// One checkerboard half-step over the short message volumes.
//
// Each work-item handles the pixel x = 2 * get_global_id(0) + ((y + t) & 1), so a
// launch only touches pixels whose (x + y + t) is even. For interior pixels the
// four message vectors u, d, l, r at (x, y) are recomputed by message_0 from
// messages stored at the four neighbouring pixels plus the data cost at (x, y).
//
// u is addressed with the byte stride u_step, d/l/r with the element stride
// u_cols, and data with the byte stride data_step. Planes are u_cols * rows
// (messages) and data_cols * rows (data) elements apart.
__kernel void one_iteration_0(__global short *u, int u_step, int u_cols,
                              __global short *data, int data_step, int data_cols,
                              __global short *d, __global short *l, __global short *r,
                              int t, int cols, int rows,
                              int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
    const int y = get_global_id(1);
    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);

    // Border pixels and out-of-range work-items are left untouched.
    if ((y <= 0) || (y >= rows - 1) || (x <= 0) || (x >= cols - 1))
        return;

    const int pixel = y * u_cols + x;
    __global short *u_here = (__global short *)((__global char *)u + y * u_step) + x;
    __global short *d_here = d + pixel;
    __global short *l_here = l + pixel;
    __global short *r_here = r + pixel;
    const __global short *cost = (__global short *)((__global char *)data + y * data_step) + x;

    const int msg_plane = u_cols * rows;
    const int data_plane = data_cols * rows;

    message_0(u_here + u_cols, l_here + 1, r_here - 1, cost, u_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_0(d_here - u_cols, l_here + 1, r_here - 1, cost, d_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_0(u_here + u_cols, d_here - u_cols, r_here - 1, cost, r_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_0(u_here + u_cols, d_here - u_cols, l_here + 1, cost, l_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
}
// Two-pass in-place lowering of a strided float cost vector: after the call no
// entry exceeds a neighbouring entry by more than cdisc_single_jump.
//
// dst    : first entry; entry k lives at dst[k * step]
// cndisp : number of entries
void calc_min_linear_penalty_1(__global float * dst, int step,
                               int cndisp, float cdisc_single_jump)
{
    // Forward sweep: entry k may cost at most entry k-1 plus one jump.
    float running = dst[0];
    for (int k = 1; k < cndisp; ++k)
    {
        const int idx = step * k;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];
        if (candidate < stored)
        {
            dst[idx] = candidate;
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }

    // Backward sweep: entry k may cost at most entry k+1 plus one jump.
    running = dst[(cndisp - 1) * step];
    for (int k = cndisp - 2; k >= 0; k--)
    {
        const int idx = step * k;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];
        if (candidate < stored)
        {
            dst[idx] = candidate;
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }
}
// Computes one outgoing float message vector in place.
//
// msg1, msg2, msg3 : three incoming message vectors, entries msg_disp_step apart
// data             : data-cost vector, entries data_disp_step apart
// dst              : output vector, entries msg_disp_step apart
// cndisp           : number of entries in every vector
//
// Steps: sum the four inputs per entry while tracking the minimum, lower the
// result with calc_min_linear_penalty_1, clamp every entry to
// minimum + cmax_disc_term, then subtract the mean of the clamped entries.
void message_1(const __global float *msg1, const __global float *msg2,
               const __global float *msg3, const __global float *data, __global float *dst,
               int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
    float lowest = FLOAT_MAX;
    for (int k = 0; k < cndisp; ++k)
    {
        const int m = msg_disp_step * k;

        float total = msg1[m];
        total += msg2[m];
        total += msg3[m];
        total += data[data_disp_step * k];

        if (total < lowest)
        {
            lowest = total;
        }
        dst[m] = total;
    }

    calc_min_linear_penalty_1(dst, msg_disp_step, cndisp, cdisc_single_jump);

    const float ceiling = lowest + cmax_disc_term;

    float sum = 0;
    for (int k = 0; k < cndisp; ++k)
    {
        const int m = msg_disp_step * k;
        float entry = dst[m];
        if (entry > ceiling)
        {
            entry = ceiling;
            dst[m] = ceiling;
        }
        sum += entry;
    }
    sum /= cndisp;

    for (int k = 0; k < cndisp; ++k)
    {
        dst[msg_disp_step * k] -= sum;
    }
}
// One checkerboard half-step over the float message volumes.
//
// Each work-item handles the pixel x = 2 * get_global_id(0) + ((y + t) & 1), so a
// launch only touches pixels whose (x + y + t) is even. For interior pixels the
// four message vectors u, d, l, r at (x, y) are recomputed by message_1 from
// messages stored at the four neighbouring pixels plus the data cost at (x, y).
//
// u is addressed with the byte stride u_step, d/l/r with the element stride
// u_cols, and data with the byte stride data_step. Planes are u_cols * rows
// (messages) and data_cols * rows (data) elements apart.
__kernel void one_iteration_1(__global float *u, int u_step, int u_cols,
                              __global float *data, int data_step, int data_cols,
                              __global float *d, __global float *l, __global float *r,
                              int t, int cols, int rows,
                              int cndisp,float cmax_disc_term, float cdisc_single_jump)
{
    const int y = get_global_id(1);
    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);

    // Border pixels and out-of-range work-items are left untouched.
    if ((y <= 0) || (y >= rows - 1) || (x <= 0) || (x >= cols - 1))
        return;

    const int pixel = y * u_cols + x;
    __global float *u_here = (__global float *)((__global char *)u + y * u_step) + x;
    __global float *d_here = d + pixel;
    __global float *l_here = l + pixel;
    __global float *r_here = r + pixel;
    const __global float *cost = (__global float *)((__global char *)data + y * data_step) + x;

    const int msg_plane = u_cols * rows;
    const int data_plane = data_cols * rows;

    message_1(u_here + u_cols, l_here + 1, r_here - 1, cost, u_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_1(d_here - u_cols, l_here + 1, r_here - 1, cost, d_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_1(u_here + u_cols, d_here - u_cols, r_here - 1, cost, r_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
    message_1(u_here + u_cols, d_here - u_cols, l_here + 1, cost, l_here,
              msg_plane, data_plane, cndisp, cmax_disc_term, cdisc_single_jump);
}
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
// Picks the winning disparity for every interior pixel from short volumes.
//
// For each plane the cost is u(x, y+1) + d(x, y-1) + l(x+1, y) + r(x-1, y) +
// data(x, y); the index of the first plane with the smallest cost is written
// to disp(x, y).
//
// u is addressed with the byte stride u_step; d, l, r and data with the
// element stride u_cols. Planes are disp_rows * u_cols elements apart.
// disp_step is the row stride of disp in bytes.
__kernel void output_0(const __global short *u, int u_step, int u_cols,
                       const __global short *d, const __global short *l,
                       const __global short *r, const __global short *data,
                       __global short *disp, int disp_rows, int disp_cols, int disp_step,
                       int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Border pixels and out-of-range work-items write nothing.
    if (y <= 0 || y >= disp_rows - 1 || x <= 0 || x >= disp_cols - 1)
        return;

    const __global short *from_u = (const __global short *)((const __global char *)u + (y + 1) * u_step) + x;
    const __global short *from_d = d + (y - 1) * u_cols + x;
    const __global short *from_l = l + y * u_cols + (x + 1);
    const __global short *from_r = r + y * u_cols + (x - 1);
    const __global short *cost = data + y * u_cols + x;

    const int plane_stride = disp_rows * u_cols;

    int winner = 0;
    float winner_cost = FLOAT_MAX;
    for (int k = 0; k < cndisp; ++k)
    {
        const int off = k * plane_stride;

        float total = from_u[off];
        total += from_d[off];
        total += from_l[off];
        total += from_r[off];
        total += cost[off];

        // Strict comparison keeps the lowest index on ties.
        if (total < winner_cost)
        {
            winner_cost = total;
            winner = k;
        }
    }

    __global short *out_row = (__global short *)((__global char *)disp + y * disp_step);
    out_row[x] = convert_short_sat(winner);
}
// Picks the winning disparity for every interior pixel from float volumes.
//
// For each plane the cost is u(x, y+1) + d(x, y-1) + l(x+1, y) + r(x-1, y) +
// data(x, y); the index of the first plane with the smallest cost is written
// to disp(x, y) as a short.
//
// u is addressed with the byte stride u_step; d, l, r and data with the
// element stride u_cols. Planes are disp_rows * u_cols elements apart.
// disp_step is the row stride of disp in bytes.
__kernel void output_1(const __global float *u, int u_step, int u_cols,
                       const __global float *d, const __global float *l,
                       const __global float *r, const __global float *data,
                       __global short *disp, int disp_rows, int disp_cols, int disp_step,
                       int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Border pixels and out-of-range work-items write nothing.
    if (y <= 0 || y >= disp_rows - 1 || x <= 0 || x >= disp_cols - 1)
        return;

    const __global float *from_u = (const __global float *)((const __global char *)u + (y + 1) * u_step) + x;
    const __global float *from_d = d + (y - 1) * u_cols + x;
    const __global float *from_l = l + y * u_cols + (x + 1);
    const __global float *from_r = r + y * u_cols + (x - 1);
    const __global float *cost = data + y * u_cols + x;

    const int plane_stride = disp_rows * u_cols;

    int winner = 0;
    float winner_cost = FLOAT_MAX;
    for (int k = 0; k < cndisp; ++k)
    {
        const int off = k * plane_stride;

        float total = from_u[off];
        total += from_d[off];
        total += from_l[off];
        total += from_r[off];
        total += cost[off];

        // Strict comparison keeps the lowest index on ties.
        if (total < winner_cost)
        {
            winner_cost = total;
            winner = k;
        }
    }

    __global short *out_row = (__global short *)((__global char *)disp + y * disp_step);
    out_row[x] = convert_short_sat(winner);
}

File diff suppressed because it is too large Load Diff