Merge branch '2.4'
This commit is contained in:
@@ -205,7 +205,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
|
||||
clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1];
|
||||
clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
|
||||
|
||||
openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, (cl_context)getoclContext(), dim, clLengthsIn ) );
|
||||
openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) );
|
||||
|
||||
openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
|
||||
openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
|
||||
@@ -219,8 +219,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
|
||||
openCLSafeCall( clAmdFftSetPlanScale ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
|
||||
|
||||
//ready to bake
|
||||
cl_command_queue clq = (cl_command_queue)getoclCommandQueue();
|
||||
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &clq, NULL, NULL ) );
|
||||
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) );
|
||||
}
|
||||
cv::ocl::FftPlan::~FftPlan()
|
||||
{
|
||||
|
||||
@@ -351,6 +351,11 @@ namespace cv
|
||||
return &(Context::getContext()->impl->clCmdQueue);
|
||||
}
|
||||
|
||||
void finish()
|
||||
{
|
||||
clFinish(Context::getContext()->impl->clCmdQueue);
|
||||
}
|
||||
|
||||
void queryDeviceInfo(DEVICE_INFO info_type, void* info)
|
||||
{
|
||||
static Info::Impl* impl = Context::getContext()->impl;
|
||||
@@ -709,7 +714,7 @@ namespace cv
|
||||
clReleaseEvent(event);
|
||||
#endif
|
||||
|
||||
clFinish(clCxt->impl->clCmdQueue);
|
||||
clFlush(clCxt->impl->clCmdQueue);
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
|
||||
@@ -905,16 +910,18 @@ namespace cv
|
||||
std::auto_ptr<Context> Context::clCxt;
|
||||
int Context::val = 0;
|
||||
static Mutex cs;
|
||||
Context *Context::getContext()
|
||||
static volatile int context_tear_down = 0;
|
||||
Context* Context::getContext()
|
||||
{
|
||||
if(*((volatile int*)&val) != 1)
|
||||
{
|
||||
AutoLock al(cs);
|
||||
if(*((volatile int*)&val) != 1)
|
||||
{
|
||||
if (context_tear_down)
|
||||
return clCxt.get();
|
||||
if( 0 == clCxt.get())
|
||||
clCxt.reset(new Context);
|
||||
|
||||
std::vector<Info> oclinfo;
|
||||
CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
|
||||
oclinfo[0].impl->setDevice(0, 0, 0);
|
||||
@@ -1042,9 +1049,14 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
|
||||
{
|
||||
// application hangs if call clReleaseCommandQueue here, so release context only
|
||||
// without context release application hangs as well
|
||||
cl_context ctx = (cl_context)getoclContext();
|
||||
if(ctx)
|
||||
openCLSafeCall(clReleaseContext(ctx));
|
||||
context_tear_down = 1;
|
||||
Context* cv_ctx = Context::getContext();
|
||||
if(cv_ctx)
|
||||
{
|
||||
cl_context ctx = (cl_context)&(cv_ctx->impl->oclcontext);
|
||||
if(ctx)
|
||||
openCLSafeCall(clReleaseContext(ctx));
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ namespace cv
|
||||
format.image_channel_data_type = CL_FLOAT;
|
||||
break;
|
||||
default:
|
||||
throw std::exception();
|
||||
CV_Error(-1, "Image forma is not supported");
|
||||
break;
|
||||
}
|
||||
switch(channels)
|
||||
@@ -157,7 +157,7 @@ namespace cv
|
||||
format.image_channel_order = CL_RGBA;
|
||||
break;
|
||||
default:
|
||||
throw std::exception();
|
||||
CV_Error(-1, "Image forma is not supported");
|
||||
break;
|
||||
}
|
||||
#if CL_VERSION_1_2
|
||||
@@ -195,7 +195,8 @@ namespace cv
|
||||
const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
|
||||
clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
|
||||
regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
|
||||
}
|
||||
clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
|
||||
}
|
||||
else
|
||||
{
|
||||
devData = (cl_mem)mat.data;
|
||||
@@ -204,7 +205,7 @@ namespace cv
|
||||
clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
|
||||
if ((mat.cols * mat.elemSize() != mat.step))
|
||||
{
|
||||
clFinish((cl_command_queue)mat.clCxt->oclCommandQueue());
|
||||
clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
|
||||
clReleaseMemObject(devData);
|
||||
}
|
||||
|
||||
@@ -229,7 +230,8 @@ namespace cv
|
||||
try
|
||||
{
|
||||
cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
|
||||
//_support = true;
|
||||
finish();
|
||||
_support = true;
|
||||
}
|
||||
catch (const cv::Exception& e)
|
||||
{
|
||||
|
||||
@@ -44,7 +44,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -63,6 +67,9 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
@@ -111,7 +118,10 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -145,7 +155,10 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -249,7 +262,10 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -288,7 +304,10 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -319,7 +338,10 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *dst, int dst_step, int dst_offset,
|
||||
double4 src2, int rows, int cols, int dst_step1)
|
||||
__global double *dst, int dst_step, int dst_offset,
|
||||
double4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -422,7 +444,10 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
|
||||
}
|
||||
__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
|
||||
}
|
||||
__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *dst, int dst_step, int dst_offset,
|
||||
float4 src2, int rows, int cols, int dst_step1)
|
||||
float4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -564,7 +589,10 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -618,7 +646,10 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
@@ -668,7 +699,10 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
|
||||
int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
|
||||
int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
|
||||
float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
|
||||
float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
|
||||
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
|
||||
double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
|
||||
double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -45,7 +45,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -42,8 +42,12 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined DOUBLE_SUPPORT
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
typedef double F;
|
||||
#else
|
||||
typedef float F;
|
||||
@@ -52,10 +56,10 @@ typedef float F;
|
||||
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
|
||||
__global uchar *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global uchar *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
__global uchar *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global uchar *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
|
||||
short4 tmp;
|
||||
short4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = dst_data;
|
||||
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
|
||||
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
|
||||
|
||||
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
|
||||
__global ushort *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global ushort *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
__global ushort *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta,F gama,
|
||||
__global ushort *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -122,34 +129,37 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
@@ -182,7 +192,10 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
|
||||
int4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
|
||||
|
||||
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
__global int *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
F alpha,F beta, F gama,
|
||||
__global int *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
|
||||
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
|
||||
|
||||
#define dst_align ((dst_offset >> bitOfInt) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> bitOfInt) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
float4 tmp;
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
float4 tmp;
|
||||
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
|
||||
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
|
||||
__global float *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
F alpha,F beta, F gama,
|
||||
__global float *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
@@ -304,7 +320,10 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
|
||||
|
||||
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
float4 tmp_data;
|
||||
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
float4 tmp_data;
|
||||
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
|
||||
// float4 tmp_data = convert_float4(tmp);
|
||||
// float4 tmp_data = convert_float4(tmp);
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
|
||||
__global double *src2, int src2_step,int src2_offset,
|
||||
F alpha,F beta, F gama,
|
||||
F alpha,F beta, F gama,
|
||||
__global double *dst, int dst_step,int dst_offset,
|
||||
int rows, int cols,int dst_step1)
|
||||
{
|
||||
@@ -366,7 +385,10 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
double4 tmp_data;
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
|
||||
double4 tmp_data;
|
||||
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
|
||||
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
|
||||
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
|
||||
|
||||
@@ -44,9 +44,13 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
|
||||
#endif
|
||||
/**************************************add with scalar without mask**************************************/
|
||||
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
@@ -59,7 +63,10 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -99,7 +106,10 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -131,7 +141,10 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -233,7 +246,10 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -378,7 +394,10 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -432,7 +451,10 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
@@ -482,7 +504,10 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
|
||||
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
|
||||
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
|
||||
float tmp_data_1 = src1_data_1 + src2_data_1;
|
||||
float tmp_data_2 = src1_data_2 + src2_data_2;
|
||||
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
|
||||
double tmp_data_1 = src1_data_1 + src2_data_1;
|
||||
double tmp_data_2 = src1_data_2 + src2_data_2;
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -44,7 +44,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**************************************add with scalar with mask**************************************/
|
||||
@@ -61,7 +65,10 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -111,7 +118,10 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -146,7 +156,10 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -267,7 +280,10 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -443,7 +459,10 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -501,7 +520,10 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
@@ -555,7 +577,10 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -43,7 +43,11 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -51,9 +55,9 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and without mask**************************************/
|
||||
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -62,30 +66,33 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -112,7 +119,10 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
char4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
char4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
|
||||
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -163,7 +173,10 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
ushort4 tmp_data = src1_data & src2_data;
|
||||
|
||||
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -215,7 +228,10 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
short4 tmp_data = src1_data & src2_data;
|
||||
|
||||
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
@@ -43,18 +43,22 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and with mask**************************************/
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
|
||||
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data & src2_data;
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -274,12 +295,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
|
||||
|
||||
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -305,15 +326,15 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -500,12 +532,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -530,15 +563,15 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -42,19 +42,22 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
#if defined (__ATI__)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (__NVIDIA__)
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************and with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -63,7 +66,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -86,9 +92,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -97,7 +104,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -119,9 +129,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -131,7 +142,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -150,9 +164,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -162,7 +177,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -181,9 +199,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -202,9 +221,10 @@ __kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -232,11 +252,11 @@ __kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step,
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step,
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step,
|
||||
char8 tmp_data = src1_data & src2_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i
|
||||
int tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step,
|
||||
char4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
|
||||
short4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step,
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
|
||||
short4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
short4 tmp_data_3 = src1_data_3 & src2_data_3;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,20 +42,22 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (__ATI__)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (__NVIDIA__)
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and with scalar with mask**************************************/
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -65,7 +67,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -90,10 +95,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -103,7 +109,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -127,10 +136,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -140,7 +150,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -161,10 +174,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -174,7 +188,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -195,10 +212,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -223,10 +241,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -252,10 +271,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -280,10 +300,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -293,7 +314,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -316,10 +340,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -329,7 +354,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -351,10 +379,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -378,10 +407,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -405,10 +435,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -432,10 +463,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -461,10 +493,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -489,10 +522,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -502,7 +536,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -549,10 +586,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -562,7 +600,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -608,10 +649,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -621,7 +663,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -650,22 +695,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -675,7 +721,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -704,22 +753,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -753,15 +803,16 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -795,16 +846,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -838,16 +890,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -872,10 +925,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -899,10 +953,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -925,10 +980,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -951,10 +1007,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -977,10 +1034,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1006,10 +1064,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -43,9 +43,12 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -61,25 +64,28 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = ~ src1_data;
|
||||
|
||||
/* if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
*/
|
||||
/* if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
*/
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -101,7 +107,10 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -135,7 +144,10 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -170,7 +182,10 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
@@ -43,7 +43,11 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -51,9 +55,9 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_or without mask**************************************/
|
||||
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -62,29 +66,32 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data | src2_data;
|
||||
|
||||
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -110,7 +117,10 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -147,7 +157,10 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -185,7 +198,10 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
@@ -43,18 +43,22 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_or with mask**************************************/
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
|
||||
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data | src2_data;
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C1_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C2_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C3_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_or_with_mask_C4_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -43,16 +43,21 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************and with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -61,7 +66,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -84,9 +92,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -95,7 +104,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -117,9 +129,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -129,7 +142,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -148,9 +164,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -160,7 +177,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -179,9 +199,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -200,9 +221,10 @@ __kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, in
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -222,9 +244,10 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -245,10 +268,10 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -259,7 +282,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -280,9 +306,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -293,7 +320,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -313,9 +343,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -335,9 +366,10 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step,
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -358,8 +390,8 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -378,9 +410,10 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -400,9 +433,10 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -423,9 +457,10 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -436,7 +471,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -480,9 +518,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -493,7 +532,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -536,9 +578,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -549,7 +592,10 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -575,21 +621,22 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -600,7 +647,10 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -626,21 +676,22 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -668,14 +719,15 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
|
||||
int tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -700,15 +752,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
|
||||
char4 tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -736,15 +789,16 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
|
||||
short4 tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -765,9 +819,10 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -787,9 +842,10 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -808,9 +864,10 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step,
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -829,9 +886,10 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step,
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -850,9 +908,10 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -874,9 +933,10 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -903,10 +963,10 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
|
||||
short4 tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
short4 tmp_data_3 = src1_data_3 | src2_data_3;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,17 +43,21 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_or with scalar with mask**************************************/
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -64,7 +68,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -89,10 +96,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -103,7 +111,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -127,10 +138,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -141,7 +153,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -162,10 +177,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -176,7 +192,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -197,10 +216,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -226,10 +246,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -254,12 +275,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -285,10 +306,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -299,7 +321,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -322,10 +347,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -336,7 +362,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -358,10 +387,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -386,10 +416,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -414,10 +445,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -442,10 +474,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -463,17 +496,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
|
||||
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
|
||||
char8 data = src_data1 | src_data2;
|
||||
char8 data = src_data1 | src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -499,10 +533,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -513,7 +548,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -560,10 +598,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -574,7 +613,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -620,10 +662,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -634,7 +677,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -663,22 +709,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -689,7 +736,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -718,22 +768,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -768,15 +819,16 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -811,17 +863,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -855,16 +908,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -890,10 +944,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -918,10 +973,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -945,10 +1001,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -972,10 +1029,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -999,10 +1057,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
@@ -1029,10 +1088,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -43,17 +43,20 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_xor without mask**************************************/
|
||||
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -62,7 +65,10 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -112,7 +118,10 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
char4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -163,7 +172,10 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
ushort4 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -215,7 +227,10 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
@@ -43,18 +43,22 @@
|
||||
//
|
||||
//M*/
|
||||
// Double-precision extension selection, active only when DOUBLE_SUPPORT is defined.
// The cl_khr_fp64 extension is enabled when its macro is defined; otherwise the
// cl_amd_fp64 extension is enabled when that macro is defined instead.
// If neither macro is defined, no extension pragma is emitted.
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_xor with mask**************************************/
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
|
||||
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data ^ src2_data;
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C1_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C2_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C3_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_bitwise_xor_with_mask_C4_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -42,19 +42,21 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
#if defined (__ATI__)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (__NVIDIA__)
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************xor with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -63,7 +65,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -86,9 +91,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -97,7 +103,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -119,9 +128,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -131,7 +141,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -150,9 +163,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -162,7 +176,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -181,9 +198,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -202,9 +220,10 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -234,9 +253,10 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step,
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step,
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step,
|
||||
char8 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i
|
||||
int tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step,
|
||||
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
|
||||
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step,
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step,
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, i
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step,
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -897,11 +956,11 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
|
||||
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
@@ -42,20 +42,23 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (__ATI__)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (__NVIDIA__)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_xor with scalar with mask**************************************/
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -65,7 +68,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -90,10 +96,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -103,7 +110,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -127,10 +137,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -140,7 +151,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -161,10 +175,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -174,7 +189,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -195,10 +213,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -223,10 +242,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -252,10 +272,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -280,10 +301,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -293,7 +315,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -316,10 +341,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -329,7 +355,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -351,10 +380,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -378,10 +408,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -405,10 +436,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -432,10 +464,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -461,10 +494,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -489,10 +523,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -502,7 +537,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -549,10 +587,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -562,7 +601,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -608,10 +650,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -621,7 +664,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -650,22 +696,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -675,7 +722,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@@ -704,22 +754,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -753,15 +804,16 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -795,16 +847,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -838,16 +891,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -872,10 +926,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -899,10 +954,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -925,10 +981,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -951,10 +1008,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -977,10 +1035,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
@@ -1006,10 +1065,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
|
||||
@@ -43,7 +43,11 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -51,9 +55,9 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -62,7 +66,10 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -102,9 +109,9 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -114,7 +121,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -153,9 +163,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -165,7 +175,10 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -207,9 +220,9 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -217,7 +230,10 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -227,7 +243,7 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
@@ -255,9 +271,9 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -265,7 +281,10 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -275,7 +294,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
@@ -297,9 +317,9 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -307,7 +327,10 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
@@ -347,9 +370,9 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
|
||||
|
||||
/***********************************Compare GT**************************/
|
||||
__kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -358,7 +381,10 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -397,9 +423,9 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -409,7 +435,10 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -450,9 +479,9 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -462,7 +491,10 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -501,9 +533,9 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -511,7 +543,10 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -521,7 +556,7 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
@@ -550,9 +585,9 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -560,7 +595,10 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -599,9 +637,9 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -609,7 +647,10 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
@@ -649,9 +690,9 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
|
||||
|
||||
/***********************************Compare GE**************************/
|
||||
__kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -660,7 +701,10 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -702,9 +746,9 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -714,7 +758,10 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -757,9 +804,9 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -769,7 +816,10 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -809,9 +859,9 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -820,7 +870,10 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -845,7 +898,7 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@@ -858,9 +911,9 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -869,7 +922,10 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -909,9 +965,9 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -920,7 +976,10 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
@@ -942,7 +1001,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
|
||||
@@ -43,13 +43,17 @@
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
/***********************************Compare NE*******************************/
|
||||
__kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -58,7 +62,10 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -98,9 +105,9 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -110,7 +117,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -150,9 +160,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -162,7 +172,10 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -200,9 +213,9 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -210,7 +223,10 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -249,9 +265,9 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -259,7 +275,10 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -269,7 +288,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
@@ -282,7 +302,7 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@@ -296,9 +316,9 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -306,7 +326,10 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
@@ -347,9 +370,9 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
|
||||
/***********************************Compare LT*******************************/
|
||||
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -358,7 +381,10 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -398,9 +424,9 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -410,7 +436,10 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -451,9 +480,9 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -463,7 +492,10 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -502,9 +534,9 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -512,7 +544,10 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -554,9 +589,9 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -564,7 +599,10 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -589,7 +627,7 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@@ -603,9 +641,9 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -613,7 +651,10 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
@@ -638,7 +679,7 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@@ -653,9 +694,9 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
|
||||
|
||||
/***********************************Compare LE*******************************/
|
||||
__kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -664,7 +705,10 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -705,9 +749,9 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
|
||||
|
||||
|
||||
__kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -717,7 +761,10 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -758,9 +805,9 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
|
||||
|
||||
|
||||
__kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
@@ -770,7 +817,10 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -809,9 +859,9 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -819,7 +869,10 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -857,9 +910,9 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
|
||||
}
|
||||
|
||||
__kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -867,7 +920,10 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
@@ -905,9 +961,9 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset,
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
__global double *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@@ -915,7 +971,10 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
|
||||
@@ -44,7 +44,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
typedef double F ;
|
||||
typedef double4 F4;
|
||||
#define convert_F4 convert_double4
|
||||
@@ -56,34 +60,24 @@ typedef float4 F4;
|
||||
#define convert_F float
|
||||
#endif
|
||||
|
||||
uchar round2_uchar(F v){
|
||||
|
||||
uchar v1 = convert_uchar_sat(round(v));
|
||||
//uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
|
||||
|
||||
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
|
||||
inline uchar round2_uchar(F v)
|
||||
{
|
||||
return convert_uchar_sat(round(v));
|
||||
}
|
||||
|
||||
ushort round2_ushort(F v){
|
||||
|
||||
ushort v1 = convert_ushort_sat(round(v));
|
||||
//ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
|
||||
|
||||
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
|
||||
inline ushort round2_ushort(F v)
|
||||
{
|
||||
return convert_ushort_sat(round(v));
|
||||
}
|
||||
short round2_short(F v){
|
||||
|
||||
short v1 = convert_short_sat(round(v));
|
||||
//short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
|
||||
|
||||
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
|
||||
inline short round2_short(F v)
|
||||
{
|
||||
return convert_short_sat(round(v));
|
||||
}
|
||||
int round2_int(F v){
|
||||
|
||||
int v1 = convert_int_sat(round(v));
|
||||
//int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
|
||||
|
||||
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
|
||||
inline int round2_int(F v)
|
||||
{
|
||||
return convert_int_sat(round(v));
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////divide///////////////////////////////////////////////////
|
||||
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1, F scalar)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int2 coor = (int2)(get_global_id(0), get_global_id(1));
|
||||
|
||||
if (x < cols && y < rows)
|
||||
if (coor.x < cols && coor.y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
coor.x = coor.x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
|
||||
mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset),
|
||||
mad24(coor.y, dst_step, dst_offset + dst_step1),
|
||||
mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
|
||||
0);
|
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index);
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 src1_data = vload4(0, src1 + src_index.x);
|
||||
uchar4 src2_data = vload4(0, src2 + src_index.y);
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z));
|
||||
|
||||
F4 tmp = convert_F4(src1_data) * scalar;
|
||||
|
||||
uchar4 tmp_data;
|
||||
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
|
||||
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
|
||||
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
|
||||
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
|
||||
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
|
||||
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
|
||||
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
|
||||
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = dst_data;
|
||||
*((__global uchar4 *)(dst + dst_args.z)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -142,7 +138,10 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -182,7 +181,10 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -297,7 +299,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src_index = mad24(y, src_step, x + src_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -333,7 +338,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
@@ -368,7 +376,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
|
||||
@@ -44,7 +44,11 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -61,7 +65,10 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
|
||||
|
||||
@@ -116,7 +123,10 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
|
||||
|
||||
@@ -158,7 +168,10 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset >> 1) & 3) << 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset >> 1) & 3) << 1)
|
||||
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
|
||||
|
||||
@@ -200,7 +213,10 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (((dst_offset >> 1) & 3) << 1)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset >> 1) & 3) << 1)
|
||||
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Dachuan Zhao, dachuan@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@@ -44,11 +43,16 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined DOUBLE_SUPPORT
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int4 round_int4(float4 v){
|
||||
int4 round_int4(float4 v)
|
||||
{
|
||||
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
|
||||
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
|
||||
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
|
||||
@@ -56,7 +60,8 @@ int4 round_int4(float4 v){
|
||||
|
||||
return convert_int4_sat(v);
|
||||
}
|
||||
uint4 round_uint4(float4 v){
|
||||
uint4 round_uint4(float4 v)
|
||||
{
|
||||
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
|
||||
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
|
||||
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
|
||||
@@ -64,7 +69,8 @@ uint4 round_uint4(float4 v){
|
||||
|
||||
return convert_uint4_sat(v);
|
||||
}
|
||||
long round_int(float v){
|
||||
long round_int(float v)
|
||||
{
|
||||
v = v + (v > 0 ? 0.5 : -0.5);
|
||||
|
||||
return convert_int_sat(v);
|
||||
@@ -85,7 +91,10 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
@@ -130,7 +139,10 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -166,7 +178,10 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
@@ -263,8 +278,8 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs
|
||||
#endif
|
||||
|
||||
__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1, float scalar)
|
||||
__global float *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1, float scalar)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
380
modules/ocl/src/opencl/stereobp.cl
Normal file
380
modules/ocl/src/opencl/stereobp.cl
Normal file
@@ -0,0 +1,380 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef T_FLOAT
|
||||
#define T float
|
||||
#else
|
||||
#define T short
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////common///////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Converts a float intermediate value back to the message type T.
//
// T is float only when T_FLOAT is defined and short otherwise, so the
// conversion is selected with the same test: a float build passes the value
// through, every other build rounds to nearest-even and saturates to the
// range of short. (Keying on T_SHORT instead would leave a build that defines
// neither flag with T == short but no saturation.)
T saturate_cast(float v)
{
#ifdef T_FLOAT
    return v;
#else
    return convert_short_sat_rte(v);
#endif
}
|
||||
|
||||
#define FLOAT_MAX 3.402823466e+38f
|
||||
// Algorithm parameters handed to comp_data through a __constant pointer.
// The host fills a structure with the same five fields in the same order.
typedef struct
{
    int   cndisp;            // number of disparities iterated by comp_data
    float cmax_data_term;    // multiplied by cdata_weight to cap the data cost
    float cdata_weight;      // multiplier applied to the pixel difference
    float cmax_disc_term;    // not read through this struct by the kernels in this file
    float cdisc_single_jump; // not read through this struct by the kernels in this file
} con_srtuct_t;
|
||||
///////////////////////////////////////////////////////////////
|
||||
////////////////////////// comp data //////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
// Absolute intensity difference between one left and one right
// single-channel pixel, returned as float.
float pix_diff_1(__global const uchar *ls, __global const uchar *rs)
{
    const int lv = *ls;
    const int rv = *rs;
    return abs(lv - rv);
}
|
||||
|
||||
// Weighted absolute difference between two 3-byte pixels.
// Byte 0 is weighted by 0.114, byte 1 by 0.587 and byte 2 by 0.299
// (the original names tb/tg/tr suggest a B, G, R byte order).
float pix_diff_3(__global const uchar *ls, __global const uchar *rs)
{
    const float w0 = 0.114f; // weight of byte 0
    const float w1 = 0.587f; // weight of byte 1
    const float w2 = 0.299f; // weight of byte 2

    float diff = w0 * abs((int)ls[0] - rs[0]);
    diff += w1 * abs((int)ls[1] - rs[1]);
    diff += w2 * abs((int)ls[2] - rs[2]);

    return diff;
}
|
||||
// Weighted absolute difference between two 4-byte pixels; the fourth byte is
// ignored. Byte 0 is weighted by 0.114, byte 1 by 0.587 and byte 2 by 0.299.
//
// The pixels are fetched with vload4, which accepts a const pointer and only
// needs byte alignment, instead of casting the const uchar pointer to a
// non-const uchar4 pointer (which also requires 4-byte alignment).
float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
{
    const uchar4 l = vload4(0, ls);
    const uchar4 r = vload4(0, rs);

    const float tr = 0.299f;
    const float tg = 0.587f;
    const float tb = 0.114f;

    float val;

    val  = tb * abs((int)l.x - r.x);
    val += tg * abs((int)l.y - r.y);
    val += tr * abs((int)l.z - r.z);

    return val;
}
|
||||
|
||||
|
||||
#ifndef CN
|
||||
#define CN 4
|
||||
#endif
|
||||
|
||||
#define CAT(X,Y) X##Y
|
||||
#define CAT2(X,Y) CAT(X,Y)
|
||||
|
||||
#define PIX_DIFF CAT2(pix_diff_, CN)
|
||||
|
||||
// Fills the data-cost volume: for every interior pixel (x, y) and every
// disparity index, stores the weighted pixel difference between left(x, y)
// and right(x - disp, y), capped at cdata_weight * cmax_data_term.
//
// left/right : 8-bit images with CN bytes per pixel; steps are in bytes
// data       : one plane of left_rows rows per disparity; data_step is in
//              bytes
// con_st     : algorithm constants (cndisp, cdata_weight, cmax_data_term)
//
// Border pixels (first/last row and column) are not written.
__kernel void comp_data(__global uchar *left,  int left_rows,  int left_cols, int left_step,
                        __global uchar *right, int right_step,
                        __global T *data, int data_step,
                        __constant con_srtuct_t *con_st)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y <= 0 || y >= (left_rows - 1) || x <= 0 || x >= (left_cols - 1))
        return;

    // From here on data_step counts elements of T rather than bytes.
    data_step /= sizeof(T);

    const __global uchar *left_px  = left  + y * left_step  + x * CN;
    const __global uchar *right_px = right + y * right_step + x * CN;
    __global T *cost = data + y * data_step + x;

    // Distance, in elements, between two consecutive disparity planes.
    const unsigned int plane_stride = data_step * left_rows;

    const int   ndisp    = con_st->cndisp;
    const float weight   = con_st->cdata_weight;
    const float max_cost = weight * con_st->cmax_data_term;

    for (int disp = 0; disp < ndisp; disp++)
    {
        // When the shifted right pixel would fall left of column 1, the
        // maximum cost is stored instead.
        float value = max_cost;
        if (x - disp >= 1)
        {
            const float diff = PIX_DIFF(left_px, right_px - disp * CN);
            value = fmin(weight * diff, max_cost);
        }
        cost[disp * plane_stride] = saturate_cast(value);
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////////// data step down ///////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Builds the next coarser level of the data-cost pyramid: every destination
// element is the sum of the 2x2 block of source elements it covers, computed
// independently for each of the cndisp disparity planes.
//
// src, src_rows : finer level; src_rows is the number of rows in one plane
// dst, dst_rows, dst_cols : coarser level, one plane per disparity
// src_step, dst_step      : row strides in bytes
//
// Source rows are clamped to src_rows - 1. When the finer level has an odd
// number of rows, row 2*y+1 of the last destination row does not exist, and
// without the clamp the read would land in the next disparity plane (or past
// the end of the buffer for the last plane).
// NOTE(review): column 2*x+1 can similarly be one past the last source column
// for odd widths; the kernel does not receive the source width, so this is
// left as-is — confirm the row stride provides padding.
__kernel void data_step_down(__global T *src, int src_rows,
                             __global T *dst, int dst_rows, int dst_cols,
                             int src_step, int dst_step,
                             int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x < dst_cols && y < dst_rows)
    {
        // Strides in elements of T from here on.
        src_step /= sizeof(T);
        dst_step /= sizeof(T);

        const int row0 = min(2 * y + 0, src_rows - 1);
        const int row1 = min(2 * y + 1, src_rows - 1);

        for (int d = 0; d < cndisp; ++d)
        {
            float dst_reg;
            dst_reg  = src[(d * src_rows + row0) * src_step + 2*x+0];
            dst_reg += src[(d * src_rows + row1) * src_step + 2*x+0];
            dst_reg += src[(d * src_rows + row0) * src_step + 2*x+1];
            dst_reg += src[(d * src_rows + row1) * src_step + 2*x+1];

            dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
        }
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////// level up messages ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Copies messages from a coarser level to the next finer one: destination
// element (x, y) of every disparity plane receives the value of source
// element (x / 2, y / 2) of the same plane.
//
// src_rows / dst_rows are the number of rows in one plane; both steps are
// given in bytes.
__kernel void level_up_message(__global T *src, int src_rows, int src_step,
                               __global T *dst, int dst_rows, int dst_cols, int dst_step,
                               int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= dst_cols || y >= dst_rows)
        return;

    // Strides in elements of T from here on.
    src_step /= sizeof(T);
    dst_step /= sizeof(T);

    // Element distance between consecutive disparity planes.
    const int dst_plane = dst_step * dst_rows;
    const int src_plane = src_step * src_rows;

    __global T *out      = dst + y * dst_step + x;
    __global const T *in = src + (y / 2 * src_step) + (x / 2);

    for (int plane = 0; plane < cndisp; ++plane)
        out[plane * dst_plane] = in[plane * src_plane];
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////// calc all iterations /////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Applies a linear penalty along the disparity axis, in place.
// A forward sweep followed by a backward sweep lowers each entry to
// (neighbour + cdisc_single_jump) whenever that is smaller.
//
// dst       : first of cndisp entries, disp_step elements apart
// The running value is kept as an unrounded float; only the stored copy goes
// through saturate_cast.
void calc_min_linear_penalty(__global T * dst, int disp_step,
                             int cndisp, float cdisc_single_jump)
{
    // Forward sweep: disparity 1 .. cndisp-1.
    float running = dst[0];
    for (int disp = 1; disp < cndisp; ++disp)
    {
        const int idx = disp_step * disp;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];

        if (candidate < stored)
        {
            dst[idx] = saturate_cast(candidate);
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }

    // Backward sweep: disparity cndisp-2 .. 0.
    running = dst[(cndisp - 1) * disp_step];
    for (int disp = cndisp - 2; disp >= 0; disp--)
    {
        const int idx = disp_step * disp;
        const float candidate = running + cdisc_single_jump;
        const float stored = dst[idx];

        if (candidate < stored)
        {
            dst[idx] = saturate_cast(candidate);
            running = candidate;
        }
        else
        {
            running = stored;
        }
    }
}
|
||||
// Computes one outgoing message vector (one value per disparity) into dst.
//
//  1. dst[i] = msg1[i] + msg2[i] + msg3[i] + data[i]; the smallest sum is
//     remembered.
//  2. The linear penalty pass (calc_min_linear_penalty) runs over dst.
//  3. Every entry is capped at (smallest sum + cmax_disc_term).
//  4. The mean of the capped entries is subtracted from every entry.
//
// msg_disp_step / data_disp_step are the element distances between
// consecutive disparities in the message and data buffers.
void message(const __global T *msg1, const __global T *msg2,
             const __global T *msg3, const __global T *data, __global T *dst,
             int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
    float lowest = FLOAT_MAX;

    for (int i = 0; i < cndisp; ++i)
    {
        const int m = msg_disp_step * i;

        float total;
        total  = msg1[m];
        total += msg2[m];
        total += msg3[m];
        total += data[data_disp_step * i];

        if (total < lowest)
            lowest = total;

        dst[m] = saturate_cast(total);
    }

    calc_min_linear_penalty(dst, msg_disp_step, cndisp, cdisc_single_jump);

    const float cap = lowest + cmax_disc_term;

    float sum = 0;
    for (int i = 0; i < cndisp; ++i)
    {
        const int m = msg_disp_step * i;

        float value = dst[m];
        if (value > cap)
        {
            value = cap;
            dst[m] = saturate_cast(cap);
        }
        sum += value;
    }
    sum /= cndisp;

    // Subtract the mean from every entry.
    for (int i = 0; i < cndisp; ++i)
        dst[msg_disp_step * i] -= sum;
}
|
||||
// One belief-propagation pass over half of the pixels.
//
// Each work-item handles column 2 * get_global_id(0) + ((y + t) & 1), so a
// pass visits every second pixel of a row, and the visited columns alternate
// with the row index and with t. Border pixels are skipped.
//
// u, d, l, r : message buffers, all addressed with u_step (bytes)
// data       : data-cost volume addressed with data_step (bytes)
// rows       : number of rows in one disparity plane
//
// The four messages of the pixel are recomputed in place in the order
// u, d, r, l; later calls therefore see the values written by earlier ones.
__kernel void one_iteration(__global T *u,    int u_step,
                            __global T *data, int data_step,
                            __global T *d,    __global T *l, __global T *r,
                            int t, int cols, int rows,
                            int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
    const int y = get_global_id(1);
    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);

    if ((y <= 0) || (y >= rows - 1) || (x <= 0) || (x >= cols - 1))
        return;

    // Strides in elements of T from here on.
    u_step /= sizeof(T);
    data_step /= sizeof(T);

    const int offset = y * u_step + x;
    __global T *us = u + offset;
    __global T *ds = d + offset;
    __global T *ls = l + offset;
    __global T *rs = r + offset;
    const __global T *dt = data + y * data_step + x;

    // Element distance between consecutive disparity planes.
    const int msg_disp_step  = u_step * rows;
    const int data_disp_step = data_step * rows;

    // Messages read from the four neighbouring pixels.
    const __global T *u_next_row = us + u_step;
    const __global T *d_prev_row = ds - u_step;
    const __global T *l_next_col = ls + 1;
    const __global T *r_prev_col = rs - 1;

    message(u_next_row, l_next_col, r_prev_col, dt, us, msg_disp_step, data_disp_step, cndisp,
            cmax_disc_term, cdisc_single_jump);
    message(d_prev_row, l_next_col, r_prev_col, dt, ds, msg_disp_step, data_disp_step, cndisp,
            cmax_disc_term, cdisc_single_jump);

    message(u_next_row, d_prev_row, r_prev_col, dt, rs, msg_disp_step, data_disp_step, cndisp,
            cmax_disc_term, cdisc_single_jump);
    message(u_next_row, d_prev_row, l_next_col, dt, ls, msg_disp_step, data_disp_step, cndisp,
            cmax_disc_term, cdisc_single_jump);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////////// output ////////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Picks, for every interior pixel, the disparity with the smallest total cost
// (sum of the four incoming messages and the data term) and writes its index
// to the disparity image.
//
// u, d, l, r, data : message / data volumes, addressed with u_step (bytes)
// disp             : 16-bit disparity image; disp_step is in bytes
//
// disp is declared as short* regardless of T: the value written is produced
// by convert_short_sat, and the host always supplies a 16-bit (CV_16S) image
// here, even when the messages are float. Typing it as T* made T_FLOAT builds
// store 4-byte floats into the 16-bit image.
// NOTE(review): data is addressed with u_step because the kernel receives no
// separate data step; this assumes both buffers share the same stride —
// confirm for user-supplied data volumes.
__kernel void output(const __global T *u, int u_step,
                     const __global T *d, const __global T *l,
                     const __global T *r, const __global T *data,
                     __global short *disp, int disp_rows, int disp_cols, int disp_step,
                     int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
    {
        // Strides in elements from here on.
        u_step /= sizeof(T);
        disp_step /= sizeof(short);

        // Messages are read from the four neighbouring pixels.
        const __global T *us = u + (y + 1) * u_step + x;
        const __global T *ds = d + (y - 1) * u_step + x;
        const __global T *ls = l + y * u_step + (x + 1);
        const __global T *rs = r + y * u_step + (x - 1);
        const __global T *dt = data + y * u_step + x;

        // Element distance between consecutive disparity planes.
        const int disp_steps = disp_rows * u_step;

        int best = 0;
        float best_val = FLOAT_MAX;
        // The index is named k so that it does not shadow the parameter d.
        for (int k = 0; k < cndisp; ++k)
        {
            float val;
            val  = us[k * disp_steps];
            val += ds[k * disp_steps];
            val += ls[k * disp_steps];
            val += rs[k * disp_steps];
            val += dt[k * disp_steps];

            if (val < best_val)
            {
                best_val = val;
                best = k;
            }
        }

        (disp + y * disp_step)[x] = convert_short_sat(best);
    }
}
|
||||
517
modules/ocl/src/stereobp.cpp
Normal file
517
modules/ocl/src/stereobp.cpp
Normal file
@@ -0,0 +1,517 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
///////////////// stereoBP /////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
|
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
extern const char *stereobp;
|
||||
}
|
||||
|
||||
}
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
namespace stereoBP
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////common////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Host-side image of the constants block uploaded by load_constants().
// The comp_data kernel reads a structure with the same five fields in the
// same order, so the layout must not be changed on one side only.
typedef struct
{
    int   cndisp;            // number of disparities
    float cmax_data_term;    // max_data_term as passed to load_constants
    float cdata_weight;      // data_weight as passed to load_constants
    float cmax_disc_term;    // max_disc_term as passed to load_constants
    float cdisc_single_jump; // disc_single_jump as passed to load_constants
} con_struct_t;
|
||||
|
||||
cl_mem cl_con_struct = NULL;
|
||||
static void load_constants(int ndisp, float max_data_term, float data_weight,
|
||||
float max_disc_term, float disc_single_jump)
|
||||
{
|
||||
con_struct_t *con_struct = new con_struct_t;
|
||||
con_struct -> cndisp = ndisp;
|
||||
con_struct -> cmax_data_term = max_data_term;
|
||||
con_struct -> cdata_weight = data_weight;
|
||||
con_struct -> cmax_disc_term = max_disc_term;
|
||||
con_struct -> cdisc_single_jump = disc_single_jump;
|
||||
|
||||
cl_con_struct = load_constant(*((cl_context*)getoclContext()), *((cl_command_queue*)getoclCommandQueue()), (void *)con_struct,
|
||||
sizeof(con_struct_t));
|
||||
|
||||
delete con_struct;
|
||||
}
|
||||
static void release_constants()
|
||||
{
|
||||
openCLFree(cl_con_struct);
|
||||
}
|
||||
// Integer division of total by grain, rounded up for positive operands.
static inline int divUp(int total, int grain)
{
    const int padded = total + grain - 1;
    return padded / grain;
}
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////comp data////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
static void comp_data_call(const oclMat &left, const oclMat &right, oclMat &data, int /*disp*/,
|
||||
float /*cmax_data_term*/, float /*cdata_weight*/)
|
||||
{
|
||||
Context *clCxt = left.clCxt;
|
||||
int channels = left.oclchannels();
|
||||
int data_type = data.type();
|
||||
|
||||
String kernelName = "comp_data";
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&left.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&right.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&right.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&cl_con_struct));
|
||||
|
||||
size_t gt[3] = {left.cols, left.rows, 1}, lt[3] = {16, 16, 1};
|
||||
|
||||
const int OPT_SIZE = 50;
|
||||
char cn_opt [OPT_SIZE] = "";
|
||||
sprintf( cn_opt, "%s -D CN=%d",
|
||||
(data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT"),
|
||||
channels
|
||||
);
|
||||
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, cn_opt);
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////data step down///////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
static void data_step_down_call(int dst_cols, int dst_rows, int src_rows,
|
||||
const oclMat &src, oclMat &dst, int disp)
|
||||
{
|
||||
Context *clCxt = src.clCxt;
|
||||
int data_type = src.type();
|
||||
|
||||
String kernelName = "data_step_down";
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp));
|
||||
|
||||
size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
|
||||
const char* t_opt = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
|
||||
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////level up message////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
static void level_up_message_call(int dst_cols, int dst_rows, int src_rows,
|
||||
oclMat &src, oclMat &dst, int ndisp)
|
||||
{
|
||||
Context *clCxt = src.clCxt;
|
||||
int data_type = src.type();
|
||||
|
||||
String kernelName = "level_up_message";
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
|
||||
|
||||
size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
|
||||
const char* t_opt = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
|
||||
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
|
||||
}
|
||||
static void level_up_messages_calls(int dst_idx, int dst_cols, int dst_rows, int src_rows,
|
||||
oclMat *mus, oclMat *mds, oclMat *mls, oclMat *mrs,
|
||||
int ndisp)
|
||||
{
|
||||
int src_idx = (dst_idx + 1) & 1;
|
||||
|
||||
level_up_message_call(dst_cols, dst_rows, src_rows,
|
||||
mus[src_idx], mus[dst_idx], ndisp);
|
||||
|
||||
level_up_message_call(dst_cols, dst_rows, src_rows,
|
||||
mds[src_idx], mds[dst_idx], ndisp);
|
||||
|
||||
level_up_message_call(dst_cols, dst_rows, src_rows,
|
||||
mls[src_idx], mls[dst_idx], ndisp);
|
||||
|
||||
level_up_message_call(dst_cols, dst_rows, src_rows,
|
||||
mrs[src_idx], mrs[dst_idx], ndisp);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////calc_all_iterations_call///////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
static void calc_all_iterations_call(int cols, int rows, oclMat &u, oclMat &d,
|
||||
oclMat &l, oclMat &r, oclMat &data,
|
||||
int t, int cndisp, float cmax_disc_term,
|
||||
float cdisc_single_jump)
|
||||
{
|
||||
Context *clCxt = l.clCxt;
|
||||
int data_type = u.type();
|
||||
|
||||
String kernelName = "one_iteration";
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cndisp));
|
||||
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cmax_disc_term));
|
||||
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cdisc_single_jump));
|
||||
|
||||
size_t gt[3] = {cols, rows, 1}, lt[3] = {16, 16, 1};
|
||||
const char* t_opt = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
|
||||
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
|
||||
}
|
||||
|
||||
static void calc_all_iterations_calls(int cols, int rows, int iters, oclMat &u,
|
||||
oclMat &d, oclMat &l, oclMat &r,
|
||||
oclMat &data, int cndisp, float cmax_disc_term,
|
||||
float cdisc_single_jump)
|
||||
{
|
||||
for(int t = 0; t < iters; ++t)
|
||||
calc_all_iterations_call(cols, rows, u, d, l, r, data, t, cndisp,
|
||||
cmax_disc_term, cdisc_single_jump);
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////output///////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void output_call(const oclMat &u, const oclMat &d, const oclMat l, const oclMat &r,
|
||||
const oclMat &data, oclMat &disp, int ndisp)
|
||||
{
|
||||
Context *clCxt = u.clCxt;
|
||||
int data_type = u.type();
|
||||
|
||||
String kernelName = "output";
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&disp.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
|
||||
|
||||
size_t gt[3] = {disp.cols, disp.rows, 1}, lt[3] = {16, 16, 1};
|
||||
const char* t_opt = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
|
||||
openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Parameter values used by the StereoBeliefPropagation constructor that does
// not take them explicitly.
namespace
{
    const float DEFAULT_MAX_DATA_TERM    = 10.0f;
    const float DEFAULT_DATA_WEIGHT      = 0.07f;
    const float DEFAULT_MAX_DISC_TERM    = 1.7f;
    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
}
|
||||
|
||||
// Suggests parameters for an image of the given size.
//
//   ndisp  : width / 4, bumped to the next even number when odd
//   iters  : (larger image side) / 100 + 2
//   levels : (int)(ln(larger image side) + 1) * 4 / 5, but at least 1 when
//            that expression yields 0
void cv::ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels)
{
    ndisp = width / 4;
    if (ndisp & 1)
        ++ndisp;

    const int larger_side = (width < height) ? height : width;
    iters = larger_side / 100 + 2;

    // The cast applies to (log + 1) only; the scaling by 4/5 is integer math.
    const int log_term = (int)(::log(static_cast<double>(larger_side)) + 1);
    levels = log_term * 4 / 5;
    if (levels == 0)
        levels = 1;
}
|
||||
|
||||
// Constructs the algorithm object with the default data / discontinuity
// terms (DEFAULT_* constants). `datas` is pre-sized to one entry per level.
cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
    : ndisp(ndisp_),
      iters(iters_),
      levels(levels_),
      max_data_term(DEFAULT_MAX_DATA_TERM),
      data_weight(DEFAULT_DATA_WEIGHT),
      max_disc_term(DEFAULT_MAX_DISC_TERM),
      disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
      msg_type(msg_type_),
      datas(levels_)
{
}
|
||||
|
||||
// Constructs the algorithm object with caller-supplied data / discontinuity
// terms. `datas` is pre-sized to one entry per level.
cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
    : ndisp(ndisp_),
      iters(iters_),
      levels(levels_),
      max_data_term(max_data_term_),
      data_weight(data_weight_),
      max_disc_term(max_disc_term_),
      disc_single_jump(disc_single_jump_),
      msg_type(msg_type_),
      datas(levels_)
{
}
|
||||
|
||||
namespace
|
||||
{
|
||||
class StereoBeliefPropagationImpl
|
||||
{
|
||||
public:
|
||||
StereoBeliefPropagationImpl(StereoBeliefPropagation &rthis_,
|
||||
oclMat &u_, oclMat &d_, oclMat &l_, oclMat &r_,
|
||||
oclMat &u2_, oclMat &d2_, oclMat &l2_, oclMat &r2_,
|
||||
std::vector<oclMat> &datas_, oclMat &out_)
|
||||
: rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
|
||||
zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
|
||||
{
|
||||
CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
|
||||
CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
|
||||
CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
|
||||
}
|
||||
|
||||
void operator()(const oclMat &left, const oclMat &right, oclMat &disp)
|
||||
{
|
||||
CV_Assert(left.size() == right.size() && left.type() == right.type());
|
||||
CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
|
||||
|
||||
rows = left.rows;
|
||||
cols = left.cols;
|
||||
|
||||
int divisor = (int)pow(2.f, rthis.levels - 1.0f);
|
||||
int lowest_cols = cols / divisor;
|
||||
int lowest_rows = rows / divisor;
|
||||
const int min_image_dim_size = 2;
|
||||
CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
|
||||
|
||||
init();
|
||||
|
||||
datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
|
||||
datas[0].setTo(Scalar_<short>::all(0));
|
||||
|
||||
cv::ocl::stereoBP::comp_data_call(left, right, datas[0], rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight);
|
||||
calcBP(disp);
|
||||
}
|
||||
|
||||
void operator()(const oclMat &data, oclMat &disp)
|
||||
{
|
||||
CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
|
||||
|
||||
rows = data.rows / rthis.ndisp;
|
||||
cols = data.cols;
|
||||
|
||||
int divisor = (int)pow(2.f, rthis.levels - 1.0f);
|
||||
int lowest_cols = cols / divisor;
|
||||
int lowest_rows = rows / divisor;
|
||||
const int min_image_dim_size = 2;
|
||||
CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
|
||||
|
||||
init();
|
||||
|
||||
datas[0] = data;
|
||||
|
||||
calcBP(disp);
|
||||
}
|
||||
private:
|
||||
void init()
|
||||
{
|
||||
u.create(rows * rthis.ndisp, cols, rthis.msg_type);
|
||||
d.create(rows * rthis.ndisp, cols, rthis.msg_type);
|
||||
l.create(rows * rthis.ndisp, cols, rthis.msg_type);
|
||||
r.create(rows * rthis.ndisp, cols, rthis.msg_type);
|
||||
|
||||
if (rthis.levels & 1)
|
||||
{
|
||||
//can clear less area
|
||||
u = zero;
|
||||
d = zero;
|
||||
l = zero;
|
||||
r = zero;
|
||||
}
|
||||
|
||||
if (rthis.levels > 1)
|
||||
{
|
||||
int less_rows = (rows + 1) / 2;
|
||||
int less_cols = (cols + 1) / 2;
|
||||
|
||||
u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
|
||||
d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
|
||||
l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
|
||||
r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
|
||||
|
||||
if ((rthis.levels & 1) == 0)
|
||||
{
|
||||
u2 = zero;
|
||||
d2 = zero;
|
||||
l2 = zero;
|
||||
r2 = zero;
|
||||
}
|
||||
}
|
||||
|
||||
cv::ocl::stereoBP::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight,
|
||||
scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
|
||||
|
||||
datas.resize(rthis.levels);
|
||||
cols_all.resize(rthis.levels);
|
||||
rows_all.resize(rthis.levels);
|
||||
|
||||
cols_all[0] = cols;
|
||||
rows_all[0] = rows;
|
||||
}
|
||||
|
||||
void calcBP(oclMat &disp)
|
||||
{
|
||||
using namespace cv::ocl::stereoBP;
|
||||
|
||||
for (int i = 1; i < rthis.levels; ++i)
|
||||
{
|
||||
cols_all[i] = (cols_all[i - 1] + 1) / 2;
|
||||
rows_all[i] = (rows_all[i - 1] + 1) / 2;
|
||||
|
||||
datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
|
||||
datas[i].setTo(Scalar_<short>::all(0));
|
||||
|
||||
data_step_down_call(cols_all[i], rows_all[i], rows_all[i - 1],
|
||||
datas[i - 1], datas[i], rthis.ndisp);
|
||||
}
|
||||
|
||||
oclMat mus[] = {u, u2};
|
||||
oclMat mds[] = {d, d2};
|
||||
oclMat mrs[] = {r, r2};
|
||||
oclMat mls[] = {l, l2};
|
||||
|
||||
int mem_idx = (rthis.levels & 1) ? 0 : 1;
|
||||
|
||||
for (int i = rthis.levels - 1; i >= 0; --i)
|
||||
{
|
||||
// for lower level we have already computed messages by setting to zero
|
||||
if (i != rthis.levels - 1)
|
||||
level_up_messages_calls(mem_idx, cols_all[i], rows_all[i], rows_all[i + 1],
|
||||
mus, mds, mls, mrs, rthis.ndisp);
|
||||
|
||||
calc_all_iterations_calls(cols_all[i], rows_all[i], rthis.iters, mus[mem_idx],
|
||||
mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i],
|
||||
rthis.ndisp, scale * rthis.max_disc_term,
|
||||
scale * rthis.disc_single_jump);
|
||||
|
||||
mem_idx = (mem_idx + 1) & 1;
|
||||
}
|
||||
if (disp.empty())
|
||||
disp.create(rows, cols, CV_16S);
|
||||
|
||||
out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
|
||||
out = zero;
|
||||
|
||||
output_call(u, d, l, r, datas.front(), out, rthis.ndisp);
|
||||
|
||||
if (disp.type() != CV_16S)
|
||||
out.convertTo(disp, disp.type());
|
||||
|
||||
release_constants();
|
||||
}
|
||||
StereoBeliefPropagationImpl& operator=(const StereoBeliefPropagationImpl&);
|
||||
|
||||
StereoBeliefPropagation &rthis;
|
||||
|
||||
oclMat &u;
|
||||
oclMat &d;
|
||||
oclMat &l;
|
||||
oclMat &r;
|
||||
|
||||
oclMat &u2;
|
||||
oclMat &d2;
|
||||
oclMat &l2;
|
||||
oclMat &r2;
|
||||
|
||||
std::vector<oclMat> &datas;
|
||||
oclMat &out;
|
||||
|
||||
const Scalar zero;
|
||||
const float scale;
|
||||
|
||||
int rows, cols;
|
||||
|
||||
std::vector<int> cols_all, rows_all;
|
||||
};
|
||||
}
|
||||
|
||||
void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
|
||||
{
|
||||
::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
|
||||
impl(left, right, disp);
|
||||
}
|
||||
|
||||
void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &data, oclMat &disp)
|
||||
{
|
||||
::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
|
||||
impl(data, disp);
|
||||
}
|
||||
Reference in New Issue
Block a user