[~] Minor refactoring, clean-up
[+] Added 128-bit transpose
This commit is contained in:
parent
e2caf4a3ed
commit
0c325cace3
@ -63,8 +63,6 @@
|
||||
#include "NCVRuntimeTemplates.hpp"
|
||||
#include "NCVHaarObjectDetection.hpp"
|
||||
|
||||
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
|
||||
|
||||
|
||||
//==============================================================================
|
||||
//
|
||||
@ -785,7 +783,6 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
|
||||
//Second parameter is the number of "dynamic" template parameters
|
||||
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
|
||||
::call( &functor,
|
||||
0xC001C0DE, //this is dummy int for the va_args C compatibility
|
||||
tbInitMaskPositively,
|
||||
tbCacheTextureIImg,
|
||||
tbCacheTextureCascade,
|
||||
@ -890,7 +887,6 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg
|
||||
//Second parameter is the number of "dynamic" template parameters
|
||||
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
|
||||
::call( &functor,
|
||||
0xC001C0DE, //this is dummy int for the va_args C compatibility
|
||||
tbCacheTextureIImg,
|
||||
tbCacheTextureCascade,
|
||||
tbDoAtomicCompaction);
|
||||
@ -957,7 +953,6 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
|
||||
//Second parameter is the number of "dynamic" template parameters
|
||||
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
|
||||
::call( &functor,
|
||||
0xC001C0DE, //this is dummy int for the va_args C compatibility
|
||||
tbMaskByInmask,
|
||||
tbDoAtomicCompaction);
|
||||
}
|
||||
@ -1554,172 +1549,6 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
|
||||
}
|
||||
|
||||
|
||||
//==============================================================================
|
||||
//
|
||||
// Visualize file
|
||||
//
|
||||
//==============================================================================
|
||||
|
||||
|
||||
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
|
||||
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
|
||||
|
||||
|
||||
template <class T>
|
||||
__global__ void drawRects(T *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
T color)
|
||||
{
|
||||
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
|
||||
if (blockId > numRects * 4)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
NcvRect32u curRect = d_rects[blockId >> 2];
|
||||
NcvBool bVertical = blockId & 0x1;
|
||||
NcvBool bTopLeft = blockId & 0x2;
|
||||
|
||||
Ncv32u pt0x, pt0y;
|
||||
if (bVertical)
|
||||
{
|
||||
Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
|
||||
|
||||
pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
|
||||
pt0y = curRect.y;
|
||||
|
||||
if (pt0x < dstWidth)
|
||||
{
|
||||
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
|
||||
{
|
||||
Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
|
||||
if (ptY < pt0y + curRect.height && ptY < dstHeight)
|
||||
{
|
||||
d_dst[ptY * dstStride + pt0x] = color;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
|
||||
|
||||
pt0x = curRect.x;
|
||||
pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
|
||||
|
||||
if (pt0y < dstHeight)
|
||||
{
|
||||
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
|
||||
{
|
||||
Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
|
||||
if (ptX < pt0x + curRect.width && ptX < dstWidth)
|
||||
{
|
||||
d_dst[pt0y * dstStride + ptX] = color;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
static NCVStatus drawRectsWrapperDevice(T *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
T color,
|
||||
cudaStream_t cuStream)
|
||||
{
|
||||
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
|
||||
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
|
||||
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
|
||||
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
|
||||
|
||||
if (numRects == 0)
|
||||
{
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
T *h_dst;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
|
||||
NcvRect32s *h_rects;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
|
||||
ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
|
||||
#endif
|
||||
|
||||
dim3 grid(numRects * 4);
|
||||
dim3 block(NUMTHREADS_DRAWRECTS);
|
||||
if (grid.x > 65535)
|
||||
{
|
||||
grid.y = (grid.x + 65534) / 65535;
|
||||
grid.x = 65535;
|
||||
}
|
||||
|
||||
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
|
||||
|
||||
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
T *h_dst_after;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
|
||||
bool bPass = true;
|
||||
for (Ncv32u i=0; i<dstHeight && bPass; i++)
|
||||
{
|
||||
for (Ncv32u j=0; j<dstWidth && bPass; j++)
|
||||
{
|
||||
if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
|
||||
{
|
||||
printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
|
||||
bPass = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
|
||||
printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
|
||||
#endif
|
||||
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv8u color,
|
||||
cudaStream_t cuStream)
|
||||
{
|
||||
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv32u color,
|
||||
cudaStream_t cuStream)
|
||||
{
|
||||
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
|
||||
}
|
||||
|
||||
|
||||
//==============================================================================
|
||||
//
|
||||
// Pipeline file
|
||||
@ -1901,13 +1730,13 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
|
||||
|
||||
NCV_SKIP_COND_BEGIN
|
||||
|
||||
nppStat = nppiStDownsampleNearest_32u_C1R(
|
||||
nppStat = nppiStDecimate_32u_C1R(
|
||||
d_integralImage.ptr(), d_integralImage.pitch(),
|
||||
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
|
||||
srcIIRoi, scale, true);
|
||||
ncvAssertReturnNcvStat(nppStat);
|
||||
|
||||
nppStat = nppiStDownsampleNearest_64u_C1R(
|
||||
nppStat = nppiStDecimate_64u_C1R(
|
||||
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
|
||||
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
|
||||
srcIIRoi, scale, true);
|
||||
@ -1969,7 +1798,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
|
||||
}
|
||||
|
||||
Ncv32u numStrongHypothesesNow = dstNumRects;
|
||||
ncvStat = ncvFilterHypotheses_host(
|
||||
ncvStat = ncvGroupRectangles_host(
|
||||
h_hypothesesIntermediate,
|
||||
numStrongHypothesesNow,
|
||||
minNeighbors,
|
||||
@ -2031,7 +1860,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
|
||||
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
|
||||
}
|
||||
|
||||
ncvStat = ncvFilterHypotheses_host(
|
||||
ncvStat = ncvGroupRectangles_host(
|
||||
h_hypothesesIntermediate,
|
||||
dstNumRects,
|
||||
minNeighbors,
|
||||
@ -2285,133 +2114,6 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &numHypotheses,
|
||||
Ncv32u minNeighbors,
|
||||
Ncv32f intersectEps,
|
||||
NCVVector<Ncv32u> *hypothesesWeights)
|
||||
{
|
||||
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
|
||||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
|
||||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
|
||||
}
|
||||
|
||||
if (numHypotheses == 0)
|
||||
{
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
std::vector<NcvRect32u> rects(numHypotheses);
|
||||
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
|
||||
|
||||
std::vector<Ncv32u> weights;
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
groupRectangles(rects, minNeighbors, intersectEps, &weights);
|
||||
}
|
||||
else
|
||||
{
|
||||
groupRectangles(rects, minNeighbors, intersectEps, NULL);
|
||||
}
|
||||
|
||||
numHypotheses = (Ncv32u)rects.size();
|
||||
if (numHypotheses > 0)
|
||||
{
|
||||
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
|
||||
}
|
||||
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
|
||||
}
|
||||
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
static NCVStatus drawRectsWrapperHost(T *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
T color)
|
||||
{
|
||||
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
|
||||
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
|
||||
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
|
||||
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
|
||||
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
|
||||
|
||||
for (Ncv32u i=0; i<numRects; i++)
|
||||
{
|
||||
NcvRect32u rect = h_rects[i];
|
||||
|
||||
if (rect.x < dstWidth)
|
||||
{
|
||||
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
|
||||
{
|
||||
h_dst[i*dstStride+rect.x] = color;
|
||||
}
|
||||
}
|
||||
if (rect.x+rect.width-1 < dstWidth)
|
||||
{
|
||||
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
|
||||
{
|
||||
h_dst[i*dstStride+rect.x+rect.width-1] = color;
|
||||
}
|
||||
}
|
||||
if (rect.y < dstHeight)
|
||||
{
|
||||
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
|
||||
{
|
||||
h_dst[rect.y*dstStride+j] = color;
|
||||
}
|
||||
}
|
||||
if (rect.y + rect.height - 1 < dstHeight)
|
||||
{
|
||||
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
|
||||
{
|
||||
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv8u color)
|
||||
{
|
||||
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv32u color)
|
||||
{
|
||||
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
|
||||
}
|
||||
|
||||
|
||||
NCVStatus loadFromXML(const std::string &filename,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
std::vector<HaarStage64> &haarStages,
|
||||
|
@ -346,153 +346,107 @@ enum
|
||||
NCVPipeObjDet_VisualizeInPlace = 0x004,
|
||||
};
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
|
||||
NcvSize32u srcRoi,
|
||||
NCVVector<NcvRect32u> &d_dstRects,
|
||||
Ncv32u &dstNumRects,
|
||||
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarStage64> &d_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &d_HaarNodes,
|
||||
NCVVector<HaarFeature64> &d_HaarFeatures,
|
||||
|
||||
NcvSize32u minObjSize,
|
||||
Ncv32u minNeighbors, //default 4
|
||||
Ncv32f scaleStep, //default 1.2f
|
||||
Ncv32u pixelStep, //default 1
|
||||
Ncv32u flags, //default NCVPipeObjDet_Default
|
||||
|
||||
INCVMemAllocator &gpuAllocator,
|
||||
INCVMemAllocator &cpuAllocator,
|
||||
cudaDeviceProp &devProp,
|
||||
cudaStream_t cuStream);
|
||||
NCV_EXPORTS NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
|
||||
NcvSize32u srcRoi,
|
||||
NCVVector<NcvRect32u> &d_dstRects,
|
||||
Ncv32u &dstNumRects,
|
||||
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarStage64> &d_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &d_HaarNodes,
|
||||
NCVVector<HaarFeature64> &d_HaarFeatures,
|
||||
|
||||
NcvSize32u minObjSize,
|
||||
Ncv32u minNeighbors, //default 4
|
||||
Ncv32f scaleStep, //default 1.2f
|
||||
Ncv32u pixelStep, //default 1
|
||||
Ncv32u flags, //default NCVPipeObjDet_Default
|
||||
|
||||
INCVMemAllocator &gpuAllocator,
|
||||
INCVMemAllocator &cpuAllocator,
|
||||
cudaDeviceProp &devProp,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
|
||||
#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF
|
||||
#define HAAR_STDDEV_BORDER 1
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
|
||||
NCVMatrix<Ncv32f> &d_weights,
|
||||
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
|
||||
Ncv32u &numDetections,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarStage64> &d_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &d_HaarNodes,
|
||||
NCVVector<HaarFeature64> &d_HaarFeatures,
|
||||
NcvBool bMaskElements,
|
||||
NcvSize32u anchorsRoi,
|
||||
Ncv32u pixelStep,
|
||||
Ncv32f scaleArea,
|
||||
INCVMemAllocator &gpuAllocator,
|
||||
INCVMemAllocator &cpuAllocator,
|
||||
cudaDeviceProp &devProp,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
|
||||
NCVMatrix<Ncv32f> &h_weights,
|
||||
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
|
||||
Ncv32u &numDetections,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures,
|
||||
NcvBool bMaskElements,
|
||||
NcvSize32u anchorsRoi,
|
||||
Ncv32u pixelStep,
|
||||
Ncv32f scaleArea);
|
||||
NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
|
||||
NCVMatrix<Ncv32f> &d_weights,
|
||||
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
|
||||
Ncv32u &numDetections,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarStage64> &d_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &d_HaarNodes,
|
||||
NCVVector<HaarFeature64> &d_HaarFeatures,
|
||||
NcvBool bMaskElements,
|
||||
NcvSize32u anchorsRoi,
|
||||
Ncv32u pixelStep,
|
||||
Ncv32f scaleArea,
|
||||
INCVMemAllocator &gpuAllocator,
|
||||
INCVMemAllocator &cpuAllocator,
|
||||
cudaDeviceProp &devProp,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv8u color,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *d_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv32u color,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv8u color);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv32u color);
|
||||
NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
|
||||
NCVMatrix<Ncv32f> &h_weights,
|
||||
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
|
||||
Ncv32u &numDetections,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures,
|
||||
NcvBool bMaskElements,
|
||||
NcvSize32u anchorsRoi,
|
||||
Ncv32u pixelStep,
|
||||
Ncv32f scaleArea);
|
||||
|
||||
|
||||
#define RECT_SIMILARITY_PROPORTION 0.2f
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
|
||||
Ncv32u numPixelMaskDetections,
|
||||
NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &totalDetections,
|
||||
Ncv32u totalMaxDetections,
|
||||
Ncv32u rectWidth,
|
||||
Ncv32u rectHeight,
|
||||
Ncv32f curScale,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
|
||||
Ncv32u numPixelMaskDetections,
|
||||
NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &totalDetections,
|
||||
Ncv32u totalMaxDetections,
|
||||
Ncv32u rectWidth,
|
||||
Ncv32u rectHeight,
|
||||
Ncv32f curScale);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &numHypotheses,
|
||||
Ncv32u minNeighbors,
|
||||
Ncv32f intersectEps,
|
||||
NCVVector<Ncv32u> *hypothesesWeights);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
|
||||
Ncv32u &numNodes, Ncv32u &numFeatures);
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures);
|
||||
NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
|
||||
Ncv32u numPixelMaskDetections,
|
||||
NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &totalDetections,
|
||||
Ncv32u totalMaxDetections,
|
||||
Ncv32u rectWidth,
|
||||
Ncv32u rectHeight,
|
||||
Ncv32f curScale,
|
||||
cudaStream_t cuStream);
|
||||
|
||||
|
||||
NCV_EXPORTS
|
||||
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
|
||||
HaarClassifierCascadeDescriptor haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures);
|
||||
NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
|
||||
Ncv32u numPixelMaskDetections,
|
||||
NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &totalDetections,
|
||||
Ncv32u totalMaxDetections,
|
||||
Ncv32u rectWidth,
|
||||
Ncv32u rectHeight,
|
||||
Ncv32f curScale);
|
||||
|
||||
|
||||
NCV_EXPORTS NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
|
||||
Ncv32u &numNodes, Ncv32u &numFeatures);
|
||||
|
||||
|
||||
NCV_EXPORTS NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
|
||||
HaarClassifierCascadeDescriptor &haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures);
|
||||
|
||||
|
||||
NCV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
|
||||
HaarClassifierCascadeDescriptor haar,
|
||||
NCVVector<HaarStage64> &h_HaarStages,
|
||||
NCVVector<HaarClassifierNode128> &h_HaarNodes,
|
||||
NCVVector<HaarFeature64> &h_HaarFeatures);
|
||||
|
||||
|
||||
|
||||
|
@ -44,10 +44,6 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include "NPP_staging.hpp"
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
|
||||
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
|
||||
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
|
||||
@ -161,12 +157,6 @@ const Ncv32u NUM_SCAN_THREADS = 256;
|
||||
const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
|
||||
|
||||
|
||||
struct T_true {};
|
||||
struct T_false {};
|
||||
template <typename T, typename U> struct is_same : T_false {};
|
||||
template <typename T> struct is_same<T, T> : T_true {};
|
||||
|
||||
|
||||
template<class T_in, class T_out>
|
||||
struct _scanElemOp
|
||||
{
|
||||
@ -175,13 +165,16 @@ struct _scanElemOp
|
||||
{
|
||||
return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
template <int v> struct Int2Type { enum { value = v }; };
|
||||
|
||||
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
|
||||
{
|
||||
return (T_out)elem;
|
||||
}
|
||||
|
||||
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
|
||||
{
|
||||
return (T_out)(elem*elem);
|
||||
@ -190,25 +183,25 @@ private:
|
||||
|
||||
|
||||
template<class T>
|
||||
inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
|
||||
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
|
||||
|
||||
|
||||
template<>
|
||||
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
{
|
||||
return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
|
||||
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
{
|
||||
return d_src[curElemOffs];
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
|
||||
{
|
||||
return d_src[curElemOffs];
|
||||
}
|
||||
@ -233,7 +226,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32
|
||||
* \return None
|
||||
*/
|
||||
template <class T_in, class T_out, bool tbDoSqr>
|
||||
__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
|
||||
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
|
||||
T_out *d_II, Ncv32u IIstride)
|
||||
{
|
||||
//advance pointers to the current line
|
||||
@ -263,7 +256,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
|
||||
if (curElemOffs < srcWidth)
|
||||
{
|
||||
//load elements
|
||||
curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
|
||||
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
|
||||
}
|
||||
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
|
||||
|
||||
@ -298,55 +291,28 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
|
||||
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
|
||||
{
|
||||
cudaChannelFormatDesc cfdTex;
|
||||
size_t alignmentOffset = 0;
|
||||
if (sizeof(T_in) == 1)
|
||||
{
|
||||
cfdTex = cudaCreateChannelDesc<Ncv8u>();
|
||||
size_t alignmentOffset;
|
||||
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
|
||||
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
|
||||
if (alignmentOffset > 0)
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
|
||||
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
|
||||
}
|
||||
}
|
||||
scanRows
|
||||
<T_in, T_out, tbDoSqr>
|
||||
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
|
||||
(d_src, roi.width, srcStride, d_dst, dstStride);
|
||||
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
|
||||
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
T_in *h_src;
|
||||
T_out *h_dst;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
|
||||
memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
|
||||
memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
NcvBool bPass = true;
|
||||
for (Ncv32u i=0; i<roi.height && bPass; i++)
|
||||
{
|
||||
T_out curElem = 0;
|
||||
for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
|
||||
{
|
||||
if (curElem != h_dst[i * dstStride + j])
|
||||
{
|
||||
printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
|
||||
bPass = false;
|
||||
}
|
||||
if (j < roi.width)
|
||||
{
|
||||
curElem += scanElemOp<T_op>(h_src[i*srcStride+j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
|
||||
printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
|
||||
#endif
|
||||
|
||||
return NPPST_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
|
||||
static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
|
||||
{
|
||||
Ncv32u alignMask = allocatorAlignment-1;
|
||||
Ncv32u inverseAlignMask = ~alignMask;
|
||||
@ -676,7 +642,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
|
||||
|
||||
//==============================================================================
|
||||
//
|
||||
// DownsampleNearest.cu
|
||||
// Decimate.cu
|
||||
//
|
||||
//==============================================================================
|
||||
|
||||
@ -686,25 +652,25 @@ const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
|
||||
|
||||
|
||||
template<class T, NcvBool tbCacheTexture>
|
||||
__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
|
||||
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
|
||||
|
||||
|
||||
template<>
|
||||
__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
|
||||
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
|
||||
{
|
||||
return tex1Dfetch(tex32u, x);
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
|
||||
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
|
||||
{
|
||||
return d_src[x];
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
|
||||
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
|
||||
{
|
||||
uint2 tmp = tex1Dfetch(tex64u, x);
|
||||
Ncv64u res = (Ncv64u)tmp.y;
|
||||
@ -715,14 +681,14 @@ __device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_sr
|
||||
|
||||
|
||||
template<>
|
||||
__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
|
||||
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
|
||||
{
|
||||
return d_src[x];
|
||||
}
|
||||
|
||||
|
||||
template <class T, NcvBool tbCacheTexture>
|
||||
__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
|
||||
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u dstRoi, Ncv32u scale)
|
||||
{
|
||||
int curX = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
@ -733,12 +699,12 @@ __global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u
|
||||
return;
|
||||
}
|
||||
|
||||
d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
|
||||
d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
|
||||
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
|
||||
T *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture)
|
||||
@ -761,7 +727,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
|
||||
|
||||
if (!readThruTexture)
|
||||
{
|
||||
downsampleNearest_C1R
|
||||
decimate_C1R
|
||||
<T, false>
|
||||
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
|
||||
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
|
||||
@ -787,7 +753,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
|
||||
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
|
||||
}
|
||||
|
||||
downsampleNearest_C1R
|
||||
decimate_C1R
|
||||
<T, true>
|
||||
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
|
||||
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
|
||||
@ -795,39 +761,12 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
|
||||
|
||||
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
T *h_src;
|
||||
T *h_dst;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
|
||||
bool bPass = true;
|
||||
|
||||
for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
|
||||
{
|
||||
for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
|
||||
{
|
||||
if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
|
||||
{
|
||||
printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
|
||||
bPass = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
|
||||
printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
|
||||
#endif
|
||||
|
||||
return NPPST_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
|
||||
static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
|
||||
T *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale)
|
||||
{
|
||||
@ -856,40 +795,40 @@ static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
|
||||
}
|
||||
|
||||
|
||||
#define implementNppDownsampleNearest(bit, typ) \
|
||||
NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
|
||||
#define implementNppDecimate(bit, typ) \
|
||||
NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
|
||||
Ncv##bit##typ *d_dst, Ncv32u dstStep, \
|
||||
NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
|
||||
{ \
|
||||
return downsampleNearestWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
|
||||
return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
|
||||
(Ncv##bit##u *)d_dst, dstStep, \
|
||||
srcRoi, scale, readThruTexture); \
|
||||
}
|
||||
|
||||
|
||||
#define implementNppDownsampleNearestHost(bit, typ) \
|
||||
NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
|
||||
#define implementNppDecimateHost(bit, typ) \
|
||||
NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
|
||||
Ncv##bit##typ *h_dst, Ncv32u dstStep, \
|
||||
NcvSize32u srcRoi, Ncv32u scale) \
|
||||
{ \
|
||||
return downsampleNearestWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
|
||||
return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
|
||||
(Ncv##bit##u *)h_dst, dstStep, \
|
||||
srcRoi, scale); \
|
||||
}
|
||||
|
||||
|
||||
implementNppDownsampleNearest(32, u)
|
||||
implementNppDownsampleNearest(32, s)
|
||||
implementNppDownsampleNearest(32, f)
|
||||
implementNppDownsampleNearest(64, u)
|
||||
implementNppDownsampleNearest(64, s)
|
||||
implementNppDownsampleNearest(64, f)
|
||||
implementNppDownsampleNearestHost(32, u)
|
||||
implementNppDownsampleNearestHost(32, s)
|
||||
implementNppDownsampleNearestHost(32, f)
|
||||
implementNppDownsampleNearestHost(64, u)
|
||||
implementNppDownsampleNearestHost(64, s)
|
||||
implementNppDownsampleNearestHost(64, f)
|
||||
implementNppDecimate(32, u)
|
||||
implementNppDecimate(32, s)
|
||||
implementNppDecimate(32, f)
|
||||
implementNppDecimate(64, u)
|
||||
implementNppDecimate(64, s)
|
||||
implementNppDecimate(64, f)
|
||||
implementNppDecimateHost(32, u)
|
||||
implementNppDecimateHost(32, s)
|
||||
implementNppDecimateHost(32, f)
|
||||
implementNppDecimateHost(64, u)
|
||||
implementNppDecimateHost(64, s)
|
||||
implementNppDecimateHost(64, f)
|
||||
|
||||
|
||||
//==============================================================================
|
||||
@ -1051,46 +990,6 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
|
||||
|
||||
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
Ncv32u *h_sum;
|
||||
Ncv64u *h_sqsum;
|
||||
Ncv32f *h_norm_d;
|
||||
Ncv32u ExtHeight = roi.height + rect.y + rect.height;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
|
||||
Ncv32f *h_norm_h;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
|
||||
|
||||
ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea));
|
||||
|
||||
const Ncv64f relEPS = 0.005;
|
||||
bool bPass = true;
|
||||
for (Ncv32u i=0; i<roi.height && bPass; i++)
|
||||
{
|
||||
for (Ncv32u j=0; j<roi.width && bPass; j++)
|
||||
{
|
||||
Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
|
||||
Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
|
||||
|
||||
if (relErr > relEPS)
|
||||
{
|
||||
printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
|
||||
bPass = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
|
||||
printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
|
||||
#endif
|
||||
|
||||
return NPPST_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1251,34 +1150,6 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
|
||||
(d_src, srcStride, d_dst, dstStride, srcRoi);
|
||||
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
|
||||
|
||||
#if defined _SELF_TEST_
|
||||
Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
|
||||
Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
|
||||
T *h_src;
|
||||
T *h_dst;
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
|
||||
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
|
||||
memset(h_src, 0, srcStride * heightExt * sizeof(T));
|
||||
memset(h_dst, 0, dstStride * widthExt * sizeof(T));
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
|
||||
NcvBool bPass = true;
|
||||
for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
|
||||
{
|
||||
for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
|
||||
{
|
||||
if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
|
||||
{
|
||||
printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
|
||||
bPass = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
|
||||
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
|
||||
printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
|
||||
#endif
|
||||
|
||||
return NPPST_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1341,6 +1212,20 @@ implementNppTransposeHost(64,s)
|
||||
implementNppTransposeHost(64,f)
|
||||
|
||||
|
||||
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
|
||||
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
|
||||
{
|
||||
return transposeWrapperDevice<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
|
||||
}
|
||||
|
||||
|
||||
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
|
||||
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
|
||||
{
|
||||
return transposeWrapperHost<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
|
||||
}
|
||||
|
||||
|
||||
//==============================================================================
|
||||
//
|
||||
// Compact.cu
|
||||
|
@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
|
||||
* \return NCV status code
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
|
||||
Ncv32u *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
|
||||
Ncv32u *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
|
||||
* \see nppiStDownsampleNearest_32u_C1R
|
||||
* \see nppiStDecimate_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
|
||||
Ncv32s *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
|
||||
Ncv32s *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
|
||||
* \see nppiStDownsampleNearest_32u_C1R
|
||||
* \see nppiStDecimate_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
|
||||
Ncv32f *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
|
||||
Ncv32f *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
|
||||
* \see nppiStDownsampleNearest_32u_C1R
|
||||
* \see nppiStDecimate_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
|
||||
Ncv64u *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
|
||||
Ncv64u *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
|
||||
* \see nppiStDownsampleNearest_32u_C1R
|
||||
* \see nppiStDecimate_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
|
||||
Ncv64s *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
|
||||
Ncv64s *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
|
||||
* \see nppiStDownsampleNearest_32u_C1R
|
||||
* \see nppiStDecimate_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
|
||||
Ncv64f *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
NCVStatus nppiStDecimate_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
|
||||
Ncv64f *d_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale,
|
||||
NcvBool readThruTexture);
|
||||
|
||||
|
||||
/**
|
||||
@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
|
||||
* \return NCV status code
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
|
||||
Ncv32u *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
|
||||
Ncv32u *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
|
||||
* \see nppiStDownsampleNearest_32u_C1R_host
|
||||
* \see nppiStDecimate_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
|
||||
Ncv32s *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
|
||||
Ncv32s *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
|
||||
* \see nppiStDownsampleNearest_32u_C1R_host
|
||||
* \see nppiStDecimate_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
|
||||
Ncv32f *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
|
||||
Ncv32f *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
|
||||
* \see nppiStDownsampleNearest_32u_C1R_host
|
||||
* \see nppiStDecimate_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
|
||||
Ncv64u *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
|
||||
Ncv64u *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
|
||||
* \see nppiStDownsampleNearest_32u_C1R_host
|
||||
* \see nppiStDecimate_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
|
||||
Ncv64s *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
|
||||
Ncv64s *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
|
||||
* \see nppiStDownsampleNearest_32u_C1R_host
|
||||
* \see nppiStDecimate_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
|
||||
Ncv64f *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
NCVStatus nppiStDecimate_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
|
||||
Ncv64f *h_dst, Ncv32u dstStep,
|
||||
NcvSize32u srcRoi, Ncv32u scale);
|
||||
|
||||
|
||||
/**
|
||||
@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
|
||||
Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
|
||||
|
||||
|
||||
/**
|
||||
* Transposes an image. 128-bit pixels of any type, single channel
|
||||
* \see nppiStTranspose_32u_C1R
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
|
||||
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
|
||||
|
||||
|
||||
/**
|
||||
* Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
|
||||
*
|
||||
@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
|
||||
Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
|
||||
|
||||
|
||||
/**
|
||||
* Transposes an image. 128-bit pixels of any type, single channel. Host implementation
|
||||
* \see nppiStTranspose_32u_C1R_host
|
||||
*/
|
||||
NCV_EXPORTS
|
||||
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
|
||||
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the size of the temporary buffer for integral image creation
|
||||
*
|
||||
|
@ -40,14 +40,9 @@
|
||||
//M*/
|
||||
|
||||
|
||||
#if !defined (HAVE_CUDA)
|
||||
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
|
||||
#include <ios>
|
||||
#include <stdarg.h>
|
||||
#include <vector>
|
||||
#include "NCV.hpp"
|
||||
|
||||
|
||||
@ -182,6 +177,78 @@ NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NC
|
||||
}
|
||||
|
||||
|
||||
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
|
||||
const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
|
||||
Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
|
||||
{
|
||||
NCVStatus ncvStat;
|
||||
switch (dstType)
|
||||
{
|
||||
case NCVMemoryTypeHostPageable:
|
||||
case NCVMemoryTypeHostPinned:
|
||||
switch (srcType)
|
||||
{
|
||||
case NCVMemoryTypeHostPageable:
|
||||
case NCVMemoryTypeHostPinned:
|
||||
for (Ncv32u i=0; i<height; i++)
|
||||
{
|
||||
memcpy((char*)dst + i * dstPitch, (char*)src + i * srcPitch, widthbytes);
|
||||
}
|
||||
ncvStat = NCV_SUCCESS;
|
||||
break;
|
||||
case NCVMemoryTypeDevice:
|
||||
if (cuStream != 0)
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
|
||||
}
|
||||
else
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
|
||||
}
|
||||
ncvStat = NCV_SUCCESS;
|
||||
break;
|
||||
default:
|
||||
ncvStat = NCV_MEM_RESIDENCE_ERROR;
|
||||
}
|
||||
break;
|
||||
case NCVMemoryTypeDevice:
|
||||
switch (srcType)
|
||||
{
|
||||
case NCVMemoryTypeHostPageable:
|
||||
case NCVMemoryTypeHostPinned:
|
||||
if (cuStream != 0)
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
|
||||
}
|
||||
else
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
|
||||
}
|
||||
ncvStat = NCV_SUCCESS;
|
||||
break;
|
||||
case NCVMemoryTypeDevice:
|
||||
if (cuStream != 0)
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
|
||||
}
|
||||
else
|
||||
{
|
||||
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
|
||||
}
|
||||
ncvStat = NCV_SUCCESS;
|
||||
break;
|
||||
default:
|
||||
ncvStat = NCV_MEM_RESIDENCE_ERROR;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
ncvStat = NCV_MEM_RESIDENCE_ERROR;
|
||||
}
|
||||
|
||||
return ncvStat;
|
||||
}
|
||||
|
||||
|
||||
//===================================================================
|
||||
//
|
||||
// NCVMemStackAllocator class members implementation
|
||||
@ -195,8 +262,10 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
|
||||
_maxSize(0),
|
||||
allocBegin(NULL),
|
||||
begin(NULL),
|
||||
end(NULL),
|
||||
_memType(NCVMemoryTypeNone),
|
||||
_alignment(alignment)
|
||||
_alignment(alignment),
|
||||
bReusesMemory(false)
|
||||
{
|
||||
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
|
||||
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
|
||||
@ -573,4 +642,264 @@ double ncvEndQueryTimerMs(NcvTimer t)
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif /* !defined (HAVE_CUDA) */
|
||||
|
||||
//===================================================================
|
||||
//
|
||||
// Operations with rectangles
|
||||
//
|
||||
//===================================================================
|
||||
|
||||
|
||||
//from OpenCV
|
||||
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
|
||||
|
||||
|
||||
NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
|
||||
Ncv32u &numHypotheses,
|
||||
Ncv32u minNeighbors,
|
||||
Ncv32f intersectEps,
|
||||
NCVVector<Ncv32u> *hypothesesWeights)
|
||||
{
|
||||
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
|
||||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
|
||||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
|
||||
}
|
||||
|
||||
if (numHypotheses == 0)
|
||||
{
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
std::vector<NcvRect32u> rects(numHypotheses);
|
||||
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
|
||||
|
||||
std::vector<Ncv32u> weights;
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
groupRectangles(rects, minNeighbors, intersectEps, &weights);
|
||||
}
|
||||
else
|
||||
{
|
||||
groupRectangles(rects, minNeighbors, intersectEps, NULL);
|
||||
}
|
||||
|
||||
numHypotheses = (Ncv32u)rects.size();
|
||||
if (numHypotheses > 0)
|
||||
{
|
||||
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
|
||||
}
|
||||
|
||||
if (hypothesesWeights != NULL)
|
||||
{
|
||||
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
|
||||
}
|
||||
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
static NCVStatus drawRectsWrapperHost(T *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
T color)
|
||||
{
|
||||
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
|
||||
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
|
||||
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
|
||||
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
|
||||
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
|
||||
|
||||
for (Ncv32u i=0; i<numRects; i++)
|
||||
{
|
||||
NcvRect32u rect = h_rects[i];
|
||||
|
||||
if (rect.x < dstWidth)
|
||||
{
|
||||
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
|
||||
{
|
||||
h_dst[i*dstStride+rect.x] = color;
|
||||
}
|
||||
}
|
||||
if (rect.x+rect.width-1 < dstWidth)
|
||||
{
|
||||
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
|
||||
{
|
||||
h_dst[i*dstStride+rect.x+rect.width-1] = color;
|
||||
}
|
||||
}
|
||||
if (rect.y < dstHeight)
|
||||
{
|
||||
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
|
||||
{
|
||||
h_dst[rect.y*dstStride+j] = color;
|
||||
}
|
||||
}
|
||||
if (rect.y + rect.height - 1 < dstHeight)
|
||||
{
|
||||
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
|
||||
{
|
||||
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NCV_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv8u color)
|
||||
{
|
||||
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
|
||||
}
|
||||
|
||||
|
||||
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
|
||||
Ncv32u dstStride,
|
||||
Ncv32u dstWidth,
|
||||
Ncv32u dstHeight,
|
||||
NcvRect32u *h_rects,
|
||||
Ncv32u numRects,
|
||||
Ncv32u color)
|
||||
{
|
||||
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
|
||||
}


const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;


template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    //each rectangle is drawn by four blocks: left/right vertical edges and top/bottom horizontal edges
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    if (blockId >= numRects * 4)
    {
        //surplus blocks (possible when the launch grid is folded along y) do nothing
        return;
    }

    NcvRect32u curRect = d_rects[blockId >> 2];
    NcvBool bVertical = blockId & 0x1;
    NcvBool bTopLeft = blockId & 0x2;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}


template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
                                        Ncv32u dstStride,
                                        Ncv32u dstWidth,
                                        Ncv32u dstHeight,
                                        NcvRect32u *d_rects,
                                        Ncv32u numRects,
                                        T color,
                                        cudaStream_t cuStream)
{
    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    if (numRects == 0)
    {
        return NCV_SUCCESS;
    }

    //one block per rectangle edge; fold the 1D grid into 2D when it exceeds the 65535 limit on grid.x
    dim3 grid(numRects * 4);
    dim3 block(NUMTHREADS_DRAWRECTS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }

    //launch on the caller's stream so the cuStream parameter is honoured
    drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);

    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    return NCV_SUCCESS;
}
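
//Worked example (illustrative, not part of this commit): with numRects = 20000 the kernel needs
//80000 blocks, so grid.x would exceed 65535. The fold above yields grid.y = 2 and grid.x = 65535,
//i.e. 131070 blocks in total; inside the kernel blockIdx.y * 65535 + blockIdx.x reconstructs the
//flat block index, and the 51070 surplus blocks simply return early.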


NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
                                 Ncv32u dstHeight,
                                 NcvRect32u *d_rects,
                                 Ncv32u numRects,
                                 Ncv8u color,
                                 cudaStream_t cuStream)
{
    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}


NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
                                  Ncv32u dstHeight,
                                  NcvRect32u *d_rects,
                                  Ncv32u numRects,
                                  Ncv32u color,
                                  cudaStream_t cuStream)
{
    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
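
//Illustrative usage sketch (not part of this commit): drawing rectangles already resident on the
//device into a device image on a user stream. d_img, d_rects and their allocators are assumed context.
static NCVStatus exampleDrawDetectionsDevice(NCVMatrix<Ncv8u> &d_img,
                                             NCVVector<NcvRect32u> &d_rects,
                                             Ncv32u numRects,
                                             cudaStream_t stream)
{
    NCVStatus stat = ncvDrawRects_8u_device(d_img.ptr(), d_img.stride(),
                                            d_img.width(), d_img.height(),
                                            d_rects.ptr(), numRects, 255, stream);
    ncvAssertReturn(stat == NCV_SUCCESS, stat);
    //the launch is asynchronous; synchronize the stream before reading d_img back
    ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
    return NCV_SUCCESS;
}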

@ -129,8 +129,8 @@ struct NcvRect8u
    Ncv8u y;
    Ncv8u width;
    Ncv8u height;
    NcvRect8u() : x(0), y(0), width(0), height(0) {};
    NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
    __host__ __device__ NcvRect8u() : x(0), y(0), width(0), height(0) {};
    __host__ __device__ NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
};


@ -140,8 +140,8 @@ struct NcvRect32s
    Ncv32s y; ///< y-coordinate of upper left corner.
    Ncv32s width; ///< Rectangle width.
    Ncv32s height; ///< Rectangle height.
    NcvRect32s() : x(0), y(0), width(0), height(0) {};
    NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
    __host__ __device__ NcvRect32s() : x(0), y(0), width(0), height(0) {};
    __host__ __device__ NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
};


@ -151,8 +151,8 @@ struct NcvRect32u
    Ncv32u y; ///< y-coordinate of upper left corner.
    Ncv32u width; ///< Rectangle width.
    Ncv32u height; ///< Rectangle height.
    NcvRect32u() : x(0), y(0), width(0), height(0) {};
    NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
    __host__ __device__ NcvRect32u() : x(0), y(0), width(0), height(0) {};
    __host__ __device__ NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
};


@ -160,8 +160,8 @@ struct NcvSize32s
{
    Ncv32s width; ///< Rectangle width.
    Ncv32s height; ///< Rectangle height.
    NcvSize32s() : width(0), height(0) {};
    NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
    __host__ __device__ NcvSize32s() : width(0), height(0) {};
    __host__ __device__ NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
};


@ -169,8 +169,8 @@ struct NcvSize32u
{
    Ncv32u width; ///< Rectangle width.
    Ncv32u height; ///< Rectangle height.
    NcvSize32u() : width(0), height(0) {};
    NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
    __host__ __device__ NcvSize32u() : width(0), height(0) {};
    __host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
};

@ -275,6 +275,7 @@ enum NCVStatus
{
    //NCV statuses
    NCV_SUCCESS,
    NCV_UNKNOWN_ERROR,

    NCV_CUDA_ERROR,
    NCV_NPP_ERROR,

@ -501,13 +502,18 @@ private:


/**
* Copy dispatcher
* Copy dispatchers
*/
NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
                                       const void *src, NCVMemoryType srcType,
                                       size_t sz, cudaStream_t cuStream);


NCV_EXPORTS NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
                                         const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
                                         Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream);


/**
* NCVVector (1D)
*/
@ -532,7 +538,7 @@ public:
        _memtype = NCVMemoryTypeNone;
    }

    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@ -600,7 +606,6 @@ public:
        this->_memtype = this->allocatedMem.begin.memtype;
    }

    ~NCVVectorAlloc()
    {
        NCVStatus ncvStat;
@ -611,25 +616,22 @@ public:
        this->clear();
    }

    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }

    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }

    NCVMemSegment getSegment() const
    {
        return allocatedMem;
    }

private:
private:
    INCVMemAllocator &allocator;
    NCVMemSegment allocatedMem;
};
@ -658,7 +660,6 @@ public:
        this->bReused = true;
    }

    NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
    {
        this->bReused = false;
@ -674,7 +675,6 @@ public:
        this->bReused = true;
    }

    NcvBool isMemReused() const
    {
        return this->bReused;
@ -703,7 +703,6 @@ public:

    virtual ~NCVMatrix() {}

    void clear()
    {
        _ptr = NULL;
@ -713,14 +712,13 @@ public:
        _memtype = NCVMemoryTypeNone;
    }

    Ncv32u stride() const
    {
        return _pitch / sizeof(T);
    }

    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
    //a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
    {
        if (howMuch == 0)
        {
@ -748,6 +746,24 @@ public:
        return ncvStat;
    }

    NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const
    {
        ncvAssertReturn(this->width() >= roi.width && this->height() >= roi.height &&
                        dst.width() >= roi.width && dst.height() >= roi.height, NCV_MEM_COPY_ERROR);
        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);

        NCVStatus ncvStat = NCV_SUCCESS;
        if (this->_memtype != NCVMemoryTypeNone)
        {
            ncvStat = memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
                                         this->_ptr, this->_pitch, this->_memtype,
                                         roi.width * sizeof(T), roi.height, cuStream);
        }

        return ncvStat;
    }

    T *ptr() const {return this->_ptr;}
    Ncv32u width() const {return this->_width;}
    Ncv32u height() const {return this->_height;}
@ -817,19 +833,16 @@ public:
        this->clear();
    }

    NcvBool isMemAllocated() const
    {
        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
    }

    Ncv32u getAllocatorsAlignment() const
    {
        return allocator.alignment();
    }

    NCVMemSegment getSegment() const
    {
        return allocatedMem;
@ -888,6 +901,23 @@ public:
        this->bReused = true;
    }

    NCVMatrixReuse(const NCVMatrix<T> &mat, NcvRect32u roi)
    {
        this->bReused = false;
        this->clear();

        ncvAssertPrintReturn(roi.x < mat.width() && roi.y < mat.height() && \
                             roi.x + roi.width <= mat.width() && roi.y + roi.height <= mat.height(),
                             "NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims", );

        this->_width = roi.width;
        this->_height = roi.height;
        this->_pitch = mat.pitch();
        this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x;
        this->_memtype = mat.memType();

        this->bReused = true;
    }

    NcvBool isMemReused() const
    {
@ -899,4 +929,27 @@ private:
    NcvBool bReused;
};

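//Illustrative usage sketch (not part of this commit) for the ROI-binding NCVMatrixReuse
//constructor added above: bind a 128x64 sub-view at (16, 8) of an existing matrix without
//copying. d_img, its element type Ncv32f and its dimensions are assumed context.
//    NcvRect32u roi(16, 8, 128, 64);
//    NCVMatrixReuse<Ncv32f> d_roiView(d_img, roi);
//    ncvAssertReturn(d_roiView.isMemReused(), NCV_NULL_PTR);
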
/**
* Operations with rectangles
*/
NCV_EXPORTS NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses, Ncv32u &numHypotheses,
                                              Ncv32u minNeighbors, Ncv32f intersectEps, NCVVector<Ncv32u> *hypothesesWeights);


NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                           NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color);


NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                            NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color);


NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                             NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color, cudaStream_t cuStream);


NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                              NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);


#endif // _ncv_hpp_

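//Illustrative usage sketch (not part of this commit) for the copy2D ROI copy declared above:
//copy the top-left 64x64 region of a source matrix into a destination matrix on a user stream.
//d_src, h_dst and cuStream are assumed context.
//    NcvSize32u roi(64, 64);
//    NCVStatus stat = d_src.copy2D(h_dst, roi, cuStream);
//    ncvAssertReturn(stat == NCV_SUCCESS, stat);
//    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
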
@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
    {
        //Convenience function used by the user
        //Takes a variable argument list, transforms it into a list
        static void call(Func *functor, int dummy, ...)
        static void call(Func *functor, ...)
        {
            //Vector used to collect arguments
            std::vector<int> templateParamList;

            //Variable argument list manipulation
            va_list listPointer;
            va_start(listPointer, dummy);
            va_start(listPointer, functor);
            //Collect parameters into the list
            for(int i=0; i<NumArguments; i++)
            {

@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()

    Ncv32u numHypothesesSrc = h_vecSrc.length();
    NCV_SKIP_COND_BEGIN
    ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
    ncvStat = ncvGroupRectangles_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
    NCV_SKIP_COND_END

@ -83,17 +83,17 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
        ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
                                                  (Ncv32u *)d_small.ptr(), d_small.pitch(),
                                                  srcSize, this->scaleFactor,
                                                  this->bTextureCache);
        ncvStat = nppiStDecimate_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
                                         (Ncv32u *)d_small.ptr(), d_small.pitch(),
                                         srcSize, this->scaleFactor,
                                         this->bTextureCache);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
        ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
                                                  (Ncv64u *)d_small.ptr(), d_small.pitch(),
                                                  srcSize, this->scaleFactor,
                                                  this->bTextureCache);
        ncvStat = nppiStDecimate_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
                                         (Ncv64u *)d_small.ptr(), d_small.pitch(),
                                         srcSize, this->scaleFactor,
                                         this->bTextureCache);
    }
    else
    {
@ -107,15 +107,15 @@ bool TestResize<T>::process()
    NCV_SKIP_COND_BEGIN
    if (sizeof(T) == sizeof(Ncv32u))
    {
        ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
                                                       (Ncv32u *)h_small.ptr(), h_small.pitch(),
                                                       srcSize, this->scaleFactor);
        ncvStat = nppiStDecimate_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
                                              (Ncv32u *)h_small.ptr(), h_small.pitch(),
                                              srcSize, this->scaleFactor);
    }
    else if (sizeof(T) == sizeof(Ncv64u))
    {
        ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
                                                       (Ncv64u *)h_small.ptr(), h_small.pitch(),
                                                       srcSize, this->scaleFactor);
        ncvStat = nppiStDecimate_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
                                              (Ncv64u *)h_small.ptr(), h_small.pitch(),
                                              srcSize, this->scaleFactor);
    }
    else
    {