refactored and extended ocl::addWeighted
This commit is contained in:
parent
b20bd470fe
commit
161674bff2
modules/ocl/src
@ -1795,64 +1795,66 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
|
||||
|
||||
void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
|
||||
src1.rows == src2.rows && src2.rows == dst.rows);
|
||||
CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
|
||||
|
||||
Context *clCxt = src1.clCxt;
|
||||
bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
|
||||
if (!hasDouble && src1.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
|
||||
return;
|
||||
}
|
||||
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
int channels = dst.oclchannels();
|
||||
int depth = dst.depth();
|
||||
|
||||
int cols1 = src1.cols * channels;
|
||||
int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
|
||||
int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1();
|
||||
int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
|
||||
|
||||
int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
|
||||
{4, 0, 4, 4, 4, 4, 4},
|
||||
{4, 0, 4, 4, 4, 4, 4},
|
||||
{4, 0, 4, 4, 4, 4, 4}
|
||||
};
|
||||
|
||||
|
||||
size_t vector_length = vector_lengths[channels - 1][depth];
|
||||
int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
|
||||
int cols = divUp(dst.cols * channels + offset_cols, vector_length);
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
|
||||
typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
|
||||
depth >= CV_32F ? "" : "_sat_rte");
|
||||
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
size_t globalThreads[3] = { cols, dst.rows, 1};
|
||||
size_t globalThreads[3] = { cols1, dst.rows, 1};
|
||||
|
||||
float alpha_f = static_cast<float>(alpha),
|
||||
beta_f = static_cast<float>(beta),
|
||||
gama_f = static_cast<float>(gama);
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
int src1_step = (int) src1.step;
|
||||
int src2_step = (int) src2.step;
|
||||
int dst_step = (int) dst.step;
|
||||
float alpha_f = alpha, beta_f = beta, gama_f = gama;
|
||||
vector<pair<size_t , const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1));
|
||||
|
||||
if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
|
||||
}
|
||||
else
|
||||
if (!hasDouble)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f ));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f ));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f ));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
|
||||
}
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
|
||||
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
|
||||
args, -1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
|
||||
|
@ -42,392 +42,34 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
typedef double F;
|
||||
#else
|
||||
typedef float F;
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
|
||||
__global uchar *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global uchar *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
|
||||
__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
|
||||
__global T * src2, int src2_step1, int src2_offset1,
|
||||
__global T * dst, int dst_step1, int dst_offset1,
|
||||
WT alpha, WT beta, WT gama,
|
||||
int cols1, int rows)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
if (x < cols1 && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step1, x + src1_offset1);
|
||||
int src2_index = mad24(y, src2_step1, x + src2_offset1);
|
||||
int dst_index = mad24(y, dst_step1, x + dst_offset1);
|
||||
|
||||
x = x << 2;
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data ,src2_data;
|
||||
|
||||
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
|
||||
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
|
||||
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
|
||||
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
|
||||
|
||||
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
|
||||
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
|
||||
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
|
||||
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
|
||||
short4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
uchar4 tmp_data = convert_uchar4_sat(tmp);
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = dst_data;
|
||||
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
|
||||
dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
|
||||
__global ushort *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global ushort *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
ushort4 tmp_data = convert_ushort4_sat(tmp);
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
|
||||
__global short *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global short *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
short4 tmp_data = convert_short4_sat(tmp);
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
__global int *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
__global int *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> bitOfInt) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
float4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
int4 tmp_data = convert_int4_sat(tmp);
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
|
||||
__global float *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
__global float *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
|
||||
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
float4 tmp_data;
|
||||
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
// float4 tmp_data = convert_float4(tmp);
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
|
||||
__global double *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
__global double *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
double4 tmp_data;
|
||||
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user