Added a workaround for Nvidia: account for the fact that a 3-channel vector type occupies 4*elem_size in local memory.
This commit is contained in:
parent
214dab39f6
commit
a48c1c8248
@@ -671,8 +671,11 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
|
|||||||
|
|
||||||
size_t wgs = dev.maxWorkGroupSize();
|
size_t wgs = dev.maxWorkGroupSize();
|
||||||
size_t lmsz = dev.localMemSize();
|
size_t lmsz = dev.localMemSize();
|
||||||
|
|
||||||
size_t src_step = _src.step(), src_offset = _src.offset();
|
size_t src_step = _src.step(), src_offset = _src.offset();
|
||||||
|
|
||||||
|
// workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
|
||||||
|
int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
|
||||||
|
|
||||||
if (((src_offset % src_step) % esz == 0) &&
|
if (((src_offset % src_step) % esz == 0) &&
|
||||||
(
|
(
|
||||||
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
|
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
|
||||||
@@ -680,7 +683,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
|
|||||||
(_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
|
(_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
|
||||||
) &&
|
) &&
|
||||||
(tileSizeX * tileSizeYmin <= wgs) &&
|
(tileSizeX * tileSizeYmin <= wgs) &&
|
||||||
(LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz)
|
(LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
Size size = _src.size(), wholeSize;
|
Size size = _src.size(), wholeSize;
|
||||||
@@ -689,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
|
|||||||
int wdepth = CV_32F;
|
int wdepth = CV_32F;
|
||||||
|
|
||||||
size_t tileSizeY = wgs / tileSizeX;
|
size_t tileSizeY = wgs / tileSizeX;
|
||||||
while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz))
|
while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
|
||||||
{
|
{
|
||||||
tileSizeY /= 2;
|
tileSizeY /= 2;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user