align grid by 4
This commit is contained in:
parent
8579666b03
commit
be0c20b758
@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
|
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
// each thread handles 16 values, use 1 block/row
|
// each thread handles 16 values, use 1 block/row
|
||||||
const int block = img.cols / 16;
|
int block = img.cols / 16;
|
||||||
|
|
||||||
|
// safe, because the step can't actually be less than 512 bytes
|
||||||
|
int align = img.cols % 4;
|
||||||
|
if ( align != 0)
|
||||||
|
{
|
||||||
|
block += (4 - align);
|
||||||
|
}
|
||||||
|
|
||||||
// launch 1 block / row
|
// launch 1 block / row
|
||||||
const int grid = img.rows;
|
const int grid = img.rows;
|
||||||
|
|
||||||
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
|
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
|
||||||
|
|
||||||
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
|
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
|
||||||
cudaSafeCall( cudaGetLastError() );
|
cudaSafeCall( cudaGetLastError() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
{
|
{
|
||||||
namespace imgproc
|
namespace imgproc
|
||||||
{
|
{
|
||||||
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
|
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
|
||||||
}
|
}
|
||||||
}}}
|
}}}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user