optimized cv::repeat
This commit is contained in:
parent
7249622ce7
commit
c83455d8a4
@ -758,16 +758,28 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
|
|||||||
|
|
||||||
/**
 * OpenCL path for cv::repeat: tiles _src ny times vertically and nx times
 * horizontally into _dst using the "repeat" kernel.
 *
 * @param _src source array
 * @param ny   number of vertical repetitions
 * @param nx   number of horizontal repetitions
 * @param _dst destination array; it is only fetched via getUMat() here, so it
 *             is presumably already allocated to the tiled size by the caller
 *             (TODO confirm against the caller)
 * @return true when the work was done (plain copy or successful kernel run),
 *         false when the kernel could not be created or failed to run.
 */
static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
{
    // A 1x1 tiling is just a copy; no kernel is needed.
    if (ny == 1 && nx == 1)
    {
        _src.copyTo(_dst);
        return true;
    }

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1,
            kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);

    // The repetition counts are baked into the program as macros, and the
    // kernel's "cn" is the vector width (kercn), not the image channel count.
    ocl::Kernel k("repeat", ocl::core::repeat_oclsrc,
                  format("-D T=%s -D nx=%d -D ny=%d -D rowsPerWI=%d -D cn=%d",
                         ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
                         nx, ny, rowsPerWI, kercn));
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), dst = _dst.getUMat();
    k.args(ocl::KernelArg::ReadOnly(src, cn, kercn), ocl::KernelArg::WriteOnlyNoSize(dst));

    // One work-item per kercn-wide element of a source row and per group of
    // rowsPerWI source rows. Cast before the arithmetic so it is done in
    // size_t and the braced initialiser performs no int -> size_t narrowing.
    size_t globalsize[] = { static_cast<size_t>(src.cols) * cn / kercn,
                            (static_cast<size_t>(src.rows) + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -4406,8 +4406,8 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
|
|||||||
CV_Assert(src.isMat() || src.isUMat()); \
|
CV_Assert(src.isMat() || src.isUMat()); \
|
||||||
int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
|
int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
|
||||||
Size csize = src.size(); \
|
Size csize = src.size(); \
|
||||||
cols.push_back(ccn * src.size().width); \
|
cols.push_back(ccn * csize.width); \
|
||||||
if (ctype != type || csize != ssize) \
|
if (ctype != type) \
|
||||||
return 1; \
|
return 1; \
|
||||||
offsets.push_back(src.offset()); \
|
offsets.push_back(src.offset()); \
|
||||||
steps.push_back(src.step()); \
|
steps.push_back(src.step()); \
|
||||||
|
47
modules/core/src/opencl/repeat.cl
Normal file
47
modules/core/src/opencl/repeat.cl
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
|
||||||
|
// Third party copyrights are property of their respective owners.
|
||||||
|
|
||||||
|
// Pixel access helpers. The program is expected to be built with the macros
// T, cn, nx, ny and rowsPerWI supplied as build options.
// NOTE(review): the cn == 3 variant additionally needs T1 (the scalar channel
// type) to be defined by the build options - confirm the host passes it
// before enabling a 3-wide path.
#if cn != 3
#define loadpix(addr) *(__global const T *)(addr)
#define storepix(val, addr)  *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T)
#else
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1)*3)
#endif

// Tiles the source image ny times vertically and nx times horizontally into
// the destination.
//
// Each work-item handles one element column x and up to rowsPerWI consecutive
// source rows starting at y0. Every source element is loaded once and stored
// to all ny * nx tile positions.
//
// src_step / dst_step and src_offset / dst_offset are used as byte quantities
// (they are added to uchar pointers); src_cols is counted in elements of
// TSIZE bytes.
__kernel void repeat(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                     __global uchar * dstptr, int dst_step, int dst_offset)
{
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;

    if (x < src_cols)
    {
        // Byte offsets use TSIZE so they agree with loadpix/storepix in both
        // the cn != 3 and the cn == 3 variants.
        int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));

        for (int y = y0, y1 = min(src_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index0 += dst_step)
        {
            T srcelem = loadpix(srcptr + src_index);

            #pragma unroll
            for (int ey = 0; ey < ny; ++ey)
            {
                // Tile row ey starts ey * src_rows destination rows lower.
                int dst_index = mad24(ey * src_rows, dst_step, dst_index0);

                #pragma unroll
                for (int ex = 0; ex < nx; ++ex)
                {
                    storepix(srcelem, dstptr + dst_index);
                    // Advance by one source row width to the next tile.
                    dst_index = mad24(src_cols, TSIZE, dst_index);
                }
            }
        }
    }
}
|
Loading…
x
Reference in New Issue
Block a user