other kernels now use row scheme
This commit is contained in:
@@ -3094,11 +3094,12 @@ static InRangeFunc getInRangeFunc(int depth)
|
|||||||
static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
||||||
InputArray _upperb, OutputArray _dst )
|
InputArray _upperb, OutputArray _dst )
|
||||||
{
|
{
|
||||||
|
const ocl::Device & d = ocl::Device::getDefault();
|
||||||
int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
|
int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
|
||||||
Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
|
Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
|
||||||
int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
|
int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
|
||||||
int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
|
int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
|
||||||
int cn = CV_MAT_CN(stype);
|
int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
|
||||||
bool lbScalar = false, ubScalar = false;
|
bool lbScalar = false, ubScalar = false;
|
||||||
|
|
||||||
if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
|
if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
|
||||||
@@ -3122,7 +3123,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
|||||||
if (lbScalar != ubScalar)
|
if (lbScalar != ubScalar)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
bool doubleSupport = d.doubleFPConfig() > 0,
|
||||||
haveScalar = lbScalar && ubScalar;
|
haveScalar = lbScalar && ubScalar;
|
||||||
|
|
||||||
if ( (!doubleSupport && sdepth == CV_64F) ||
|
if ( (!doubleSupport && sdepth == CV_64F) ||
|
||||||
@@ -3187,13 +3188,13 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
|
|||||||
uscalar.copyTo(uscalaru);
|
uscalar.copyTo(uscalaru);
|
||||||
|
|
||||||
ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
|
ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
|
||||||
ocl::KernelArg::PtrReadOnly(uscalaru));
|
ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
|
ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
|
||||||
ocl::KernelArg::ReadOnlyNoSize(uscalaru));
|
ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { ssize.width, ssize.height };
|
size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI };
|
||||||
return ker.run(2, globalsize, NULL, false);
|
return ker.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -270,21 +270,22 @@ namespace cv {
|
|||||||
|
|
||||||
static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
|
static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
|
||||||
{
|
{
|
||||||
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||||
|
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
|
||||||
|
|
||||||
String dstargs, dstdecl, processelem;
|
String dstargs, processelem, indexdecl;
|
||||||
for (int i = 0; i < cn; ++i)
|
for (int i = 0; i < cn; ++i)
|
||||||
{
|
{
|
||||||
dstargs += format("DECLARE_DST_PARAM(%d)", i);
|
dstargs += format("DECLARE_DST_PARAM(%d)", i);
|
||||||
dstdecl += format("DECLARE_DATA(%d)", i);
|
indexdecl += format("DECLARE_INDEX(%d)", i);
|
||||||
processelem += format("PROCESS_ELEM(%d)", i);
|
processelem += format("PROCESS_ELEM(%d)", i);
|
||||||
}
|
}
|
||||||
|
|
||||||
ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
|
ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
|
||||||
format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s "
|
format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
|
||||||
"-D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s",
|
" -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
|
||||||
ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
|
ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
|
||||||
dstdecl.c_str(), processelem.c_str()));
|
processelem.c_str(), indexdecl.c_str()));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@@ -299,8 +300,9 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
|
|||||||
int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
|
int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
|
||||||
for (int i = 0; i < cn; ++i)
|
for (int i = 0; i < cn; ++i)
|
||||||
argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
|
argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
|
||||||
|
k.set(argidx, rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { size.width, size.height };
|
size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
|
||||||
return k.run(2, globalsize, NULL, false);
|
return k.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -419,7 +421,8 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
|
|||||||
_mv.getUMatVector(src);
|
_mv.getUMatVector(src);
|
||||||
CV_Assert(!src.empty());
|
CV_Assert(!src.empty());
|
||||||
|
|
||||||
int type = src[0].type(), depth = CV_MAT_DEPTH(type);
|
int type = src[0].type(), depth = CV_MAT_DEPTH(type),
|
||||||
|
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
|
||||||
Size size = src[0].size();
|
Size size = src[0].size();
|
||||||
|
|
||||||
for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
|
for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
|
||||||
@@ -440,20 +443,20 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
|
|||||||
}
|
}
|
||||||
int dcn = (int)ksrc.size();
|
int dcn = (int)ksrc.size();
|
||||||
|
|
||||||
String srcargs, srcdecl, processelem, cndecl;
|
String srcargs, processelem, cndecl, indexdecl;
|
||||||
for (int i = 0; i < dcn; ++i)
|
for (int i = 0; i < dcn; ++i)
|
||||||
{
|
{
|
||||||
srcargs += format("DECLARE_SRC_PARAM(%d)", i);
|
srcargs += format("DECLARE_SRC_PARAM(%d)", i);
|
||||||
srcdecl += format("DECLARE_DATA(%d)", i);
|
|
||||||
processelem += format("PROCESS_ELEM(%d)", i);
|
processelem += format("PROCESS_ELEM(%d)", i);
|
||||||
|
indexdecl += format("DECLARE_INDEX(%d)", i);
|
||||||
cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
|
cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
|
||||||
}
|
}
|
||||||
|
|
||||||
ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
|
ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
|
||||||
format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
|
format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
|
||||||
" -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s%s",
|
" -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
|
||||||
dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
|
dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
|
||||||
srcdecl.c_str(), processelem.c_str(), cndecl.c_str()));
|
indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@@ -463,9 +466,10 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
|
|||||||
int argidx = 0;
|
int argidx = 0;
|
||||||
for (int i = 0; i < dcn; ++i)
|
for (int i = 0; i < dcn; ++i)
|
||||||
argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
|
argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
|
||||||
k.set(argidx, ocl::KernelArg::WriteOnly(dst));
|
argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
|
||||||
|
k.set(argidx, rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { dst.cols, dst.rows };
|
size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
|
||||||
return k.run(2, globalsize, NULL, false);
|
return k.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -690,7 +694,7 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d
|
|||||||
for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
|
for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
|
||||||
CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
|
CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
|
||||||
|
|
||||||
String declsrc, decldst, declproc, declcn;
|
String declsrc, decldst, declproc, declcn, indexdecl;
|
||||||
std::vector<UMat> srcargs(npairs), dstargs(npairs);
|
std::vector<UMat> srcargs(npairs), dstargs(npairs);
|
||||||
|
|
||||||
for (size_t i = 0; i < npairs; ++i)
|
for (size_t i = 0; i < npairs; ++i)
|
||||||
@@ -711,14 +715,16 @@ static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _d
|
|||||||
|
|
||||||
declsrc += format("DECLARE_INPUT_MAT(%d)", i);
|
declsrc += format("DECLARE_INPUT_MAT(%d)", i);
|
||||||
decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
|
decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
|
||||||
|
indexdecl += format("DECLARE_INDEX(%d)", i);
|
||||||
declproc += format("PROCESS_ELEM(%d)", i);
|
declproc += format("PROCESS_ELEM(%d)", i);
|
||||||
declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
|
declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
|
||||||
}
|
}
|
||||||
|
|
||||||
ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
|
ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
|
||||||
format("-D T=%s -D DECLARE_INPUT_MATS=%s -D DECLARE_OUTPUT_MATS=%s"
|
format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
|
||||||
" -D PROCESS_ELEMS=%s%s", ocl::memopTypeToStr(depth),
|
" -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
|
||||||
declsrc.c_str(), decldst.c_str(), declproc.c_str(), declcn.c_str()));
|
ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
|
||||||
|
declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@@ -2489,7 +2489,8 @@ namespace cv {
|
|||||||
static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
|
static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
|
||||||
OutputArray _dst, int flags, bool conjB )
|
OutputArray _dst, int flags, bool conjB )
|
||||||
{
|
{
|
||||||
int atype = _srcA.type(), btype = _srcB.type();
|
int atype = _srcA.type(), btype = _srcB.type(),
|
||||||
|
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
|
||||||
Size asize = _srcA.size(), bsize = _srcB.size();
|
Size asize = _srcA.size(), bsize = _srcB.size();
|
||||||
CV_Assert(asize == bsize);
|
CV_Assert(asize == bsize);
|
||||||
|
|
||||||
@@ -2509,9 +2510,9 @@ static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
k.args(ocl::KernelArg::ReadOnlyNoSize(A), ocl::KernelArg::ReadOnlyNoSize(B),
|
k.args(ocl::KernelArg::ReadOnlyNoSize(A), ocl::KernelArg::ReadOnlyNoSize(B),
|
||||||
ocl::KernelArg::WriteOnly(dst));
|
ocl::KernelArg::WriteOnly(dst), rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { asize.width, asize.height };
|
size_t globalsize[2] = { asize.width, (asize.height + rowsPerWI - 1) / rowsPerWI };
|
||||||
return k.run(2, globalsize, NULL, false);
|
return k.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2742,7 +2742,8 @@ namespace cv {
|
|||||||
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
|
static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
|
||||||
{
|
{
|
||||||
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||||
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);
|
sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
|
||||||
|
rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
|
||||||
|
|
||||||
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
|
ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
|
||||||
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
|
format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
|
||||||
@@ -2751,9 +2752,10 @@ static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
UMat m = _m.getUMat();
|
UMat m = _m.getUMat();
|
||||||
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));
|
k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
|
||||||
|
rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { m.cols, m.rows };
|
size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
|
||||||
return k.run(2, globalsize, NULL, false);
|
return k.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -52,37 +52,47 @@
|
|||||||
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||||
#ifdef HAVE_SCALAR
|
#ifdef HAVE_SCALAR
|
||||||
__global const T * src2, __global const T * src3
|
__global const T * src2, __global const T * src3,
|
||||||
#else
|
#else
|
||||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||||
__global const uchar * src3ptr, int src3_step, int src3_offset
|
__global const uchar * src3ptr, int src3_step, int src3_offset,
|
||||||
#endif
|
#endif
|
||||||
)
|
int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < dst_cols && y < dst_rows)
|
if (x < dst_cols)
|
||||||
{
|
{
|
||||||
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
|
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
|
||||||
int dst_index = mad24(y, dst_step, x + dst_offset);
|
int dst_index = mad24(y0, dst_step, x + dst_offset);
|
||||||
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
|
|
||||||
__global uchar * dst = dstptr + dst_index;
|
|
||||||
|
|
||||||
#ifndef HAVE_SCALAR
|
#ifndef HAVE_SCALAR
|
||||||
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
|
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
|
||||||
int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
|
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
|
||||||
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
|
|
||||||
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
dst[0] = 255;
|
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
|
||||||
|
{
|
||||||
|
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
|
||||||
|
__global uchar * dst = dstptr + dst_index;
|
||||||
|
#ifndef HAVE_SCALAR
|
||||||
|
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
|
||||||
|
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
|
||||||
|
#endif
|
||||||
|
|
||||||
for (int c = 0; c < cn; ++c)
|
dst[0] = 255;
|
||||||
if (src2[c] > src1[c] || src3[c] < src1[c])
|
|
||||||
{
|
for (int c = 0; c < cn; ++c)
|
||||||
dst[0] = 0;
|
if (src2[c] > src1[c] || src3[c] < src1[c])
|
||||||
break;
|
{
|
||||||
}
|
dst[0] = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef HAVE_SCALAR
|
||||||
|
src2_index += src2_step;
|
||||||
|
src3_index += src3_step;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -45,20 +45,28 @@
|
|||||||
__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
|
__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
|
||||||
#define DECLARE_OUTPUT_MAT(i) \
|
#define DECLARE_OUTPUT_MAT(i) \
|
||||||
__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
|
__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
|
||||||
|
#define DECLARE_INDEX(i) \
|
||||||
|
int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
|
||||||
|
int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset));
|
||||||
#define PROCESS_ELEM(i) \
|
#define PROCESS_ELEM(i) \
|
||||||
int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
|
|
||||||
__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
|
__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
|
||||||
int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \
|
|
||||||
__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
|
__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
|
||||||
dst##i[0] = src##i[0];
|
dst##i[0] = src##i[0]; \
|
||||||
|
src##i##_index += src##i##_step; \
|
||||||
|
dst##i##_index += dst##i##_step;
|
||||||
|
|
||||||
__kernel void mixChannels(DECLARE_INPUT_MATS DECLARE_OUTPUT_MATS int rows, int cols)
|
__kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
if (x < cols)
|
||||||
{
|
{
|
||||||
PROCESS_ELEMS
|
DECLARE_INDEX_N
|
||||||
|
|
||||||
|
for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y)
|
||||||
|
{
|
||||||
|
PROCESS_ELEM_N
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -56,26 +56,30 @@ inline float2 conjf(float2 a)
|
|||||||
__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
||||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset,
|
__global uchar * dstptr, int dst_step, int dst_offset,
|
||||||
int dst_rows, int dst_cols)
|
int dst_rows, int dst_cols, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < dst_cols && y < dst_rows)
|
if (x < dst_cols)
|
||||||
{
|
{
|
||||||
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
|
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
|
||||||
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
|
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
|
||||||
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
|
int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
|
||||||
|
|
||||||
float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
|
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y,
|
||||||
float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
|
src1_index += src1_step, src2_index += src2_step, dst_index += dst_step)
|
||||||
__global float2 * dst = (__global float2 *)(dstptr + dst_index);
|
{
|
||||||
|
float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
|
||||||
|
float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
|
||||||
|
__global float2 * dst = (__global float2 *)(dstptr + dst_index);
|
||||||
|
|
||||||
#ifdef CONJ
|
#ifdef CONJ
|
||||||
float2 v = cmulf(src0, conjf(src1));
|
float2 v = cmulf(src0, conjf(src1));
|
||||||
#else
|
#else
|
||||||
float2 v = cmulf(src0, src1);
|
float2 v = cmulf(src0, src1);
|
||||||
#endif
|
#endif
|
||||||
dst[0] = v;
|
dst[0] = v;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -56,15 +56,16 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||||
ST scalar_)
|
ST scalar_, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
if (x < cols)
|
||||||
{
|
{
|
||||||
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
|
||||||
|
|
||||||
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
|
||||||
|
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -44,42 +44,58 @@
|
|||||||
#ifdef OP_MERGE
|
#ifdef OP_MERGE
|
||||||
|
|
||||||
#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
|
#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
|
||||||
#define DECLARE_DATA(index) __global const T * src##index = \
|
#define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset));
|
||||||
(__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset)));
|
#define PROCESS_ELEM(index) \
|
||||||
#define PROCESS_ELEM(index) dst[index] = src##index[0];
|
__global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \
|
||||||
|
dst[index] = src##index[0]; \
|
||||||
|
src##index##_index += src##index##_step;
|
||||||
|
|
||||||
__kernel void merge(DECLARE_SRC_PARAMS_N
|
__kernel void merge(DECLARE_SRC_PARAMS_N
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset,
|
__global uchar * dstptr, int dst_step, int dst_offset,
|
||||||
int rows, int cols)
|
int rows, int cols, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
if (x < cols)
|
||||||
{
|
{
|
||||||
DECLARE_DATA_N
|
DECLARE_INDEX_N
|
||||||
__global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset)));
|
int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset));
|
||||||
PROCESS_ELEMS_N
|
|
||||||
|
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step)
|
||||||
|
{
|
||||||
|
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||||
|
|
||||||
|
PROCESS_ELEMS_N
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined OP_SPLIT
|
#elif defined OP_SPLIT
|
||||||
|
|
||||||
#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset
|
#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset
|
||||||
#define DECLARE_DATA(index) __global T * dst##index = \
|
#define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset));
|
||||||
(__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset)));
|
#define PROCESS_ELEM(index) \
|
||||||
#define PROCESS_ELEM(index) dst##index[0] = src[index];
|
__global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \
|
||||||
|
dst##index[0] = src[index]; \
|
||||||
|
dst##index##_index += dst##index##_step;
|
||||||
|
|
||||||
__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS)
|
__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
if (x < cols)
|
||||||
{
|
{
|
||||||
DECLARE_DATA_N
|
DECLARE_INDEX_N
|
||||||
__global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset)));
|
int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset));
|
||||||
PROCESS_ELEMS_N
|
|
||||||
|
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
|
||||||
|
{
|
||||||
|
__global const T * src = (__global const T *)(srcptr + src_index);
|
||||||
|
|
||||||
|
PROCESS_ELEMS_N
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user