diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 0de2d8e37..ac4b8f0e8 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -161,192 +161,128 @@ namespace icf { } texture troi; -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 - template - __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, - const int downscales) + +template +template +__device void CascadeInvoker::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const +{ + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x; + + // load Lavel + __shared__ Level level; + + // check POI + __shared__ volatile char roiCache[Policy::STA_Y]; + + if (!threadIdx.y && !threadIdx.x) + ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + + __syncthreads(); + + if (!roiCache[threadIdx.y]) return; + + if (!threadIdx.x) + level = levels[downscales + blockIdx.z]; + + if(x >= level.workRect.x || y >= level.workRect.y) return; + + int st = level.octave * level.step; + const int stEnd = st + level.step; + + float confidence = 0.f; + for(; st < stEnd; st += Policy::WARP) { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x; + const int nId = (st + threadIdx.x) * 3; - // load Lavel - __shared__ Level level; + Node node = nodes[nId]; - // check POI - __shared__ volatile char roiCache[8]; - if (!threadIdx.y && !threadIdx.x) - ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - __syncthreads(); + int next = 1 + (int)(sum >= threshold); - if (!roiCache[threadIdx.y]) return; + node = nodes[nId + next]; + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - if (!threadIdx.x) - level = levels[downscales + blockIdx.z]; + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves[(st + threadIdx.x) * 4 + lShift]; - if(x >= level.workRect.x || y >= level.workRect.y) return; - - Octave octave = octaves[level.octave]; - int st = octave.index * octave.stages; - const int stEnd = st + 1024; - - float confidence = 0.f; - - for(; st < stEnd; st += 32) - { - - const int nId = (st + threadIdx.x) * 3; - dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId); - Node node = nodes[nId]; - - float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - - int next = 1 + (int)(sum >= threshold); - dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); - - node = nodes[nId + next]; - threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - - const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves[(st + threadIdx.x) * 4 + lShift]; - - dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact); - dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]); - dprintf("%d: computed score: %f\n",threadIdx.x, impact); #pragma unroll - // scan on shuffl functions - for (int i = 1; i < 32; i *= 2) - { - const float n = __shfl_up(impact, i, 32); - - if (threadIdx.x >= i) - impact += n; - } - - dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); - - confidence += impact; - if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; - } - - if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) + // scan on shuffl functions + for (int i = 1; i < Policy::WARP; i *= 2) { - int idx = atomicInc(ctr, ndetections); - // store detection - objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), - __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); + const float n = __shfl_up(impact, i, Policy::WARP); + + if (threadIdx.x >= i) + impact += n; } + + confidence += impact; + if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; } -#else - template - __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, - const int downscales) + + if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - Level level = levels[blockIdx.z]; - - // if (blockIdx.z != 31) return; - if(x >= level.workRect.x || y >= level.workRect.y) return; - - // int roi = tex2D(troi, x, y); - // printf("%d\n", roi); - // if (!roi) return; - - Octave octave = octaves[level.octave]; - - int st = octave.index * octave.stages; - const int stEnd = st + 1000;//octave.stages; - - float confidence = 0.f; - - for(; st < stEnd; ++st) - { - dprintf("\n\nstage: %d\n", st); - const int nId = st * 3; - Node node = nodes[nId]; - - dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, - node.threshold >> 28, node.threshold & 0x0FFFFFFFU); - - float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - - dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, - node.rect.w, threshold); - - int next = 1 + (int)(sum >= threshold); - dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); - - node = nodes[nId + next]; - threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - - const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves[st * 4 + lShift]; - confidence += impact; - - if (confidence <= stages[st]) st = stEnd + 10; - dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); - dprintf("extracted stage: %f\n", stages[st]); - dprintf("computed score: %f\n\n", confidence); - } - - if(st == stEnd) - { - int idx = atomicInc(ctr, ndetections); - // store detection - objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), - __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); - } - } -#endif - - template<> - void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const - { - int fw = 160; - int fh = 120; - - dim3 block(32, 8); - dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1); - - uint* ctr = (uint*)(counter.ptr(0)); - Detection* det = (Detection*)objects.ptr(); - uint max_det = objects.cols / sizeof(Detection); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); - - if (scale == -1) - { - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); - cudaSafeCall( cudaGetLastError()); - - grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); - } - else - { - if (scale >= downscales) - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); - else - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); - } - - if (!stream) - { - cudaSafeCall( cudaGetLastError()); - cudaSafeCall( cudaDeviceSynchronize()); - } + int idx = atomicInc(ctr, ndetections); + objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE), + __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence); } } + +template +__global__ void soft_cascade(const CascadeInvoker invoker, Detection* objects, const uint n, uint* ctr, const int downs) +{ + invoker.template detect(objects, n, ctr, downs); +} + +template +void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const +{ + int fw = 160; + int fh = 120; + + dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1); + + uint* ctr = (uint*)(counter.ptr(0)); + Detection* det = (Detection*)objects.ptr(); + uint max_det = objects.cols / sizeof(Detection); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); + + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); + + const CascadeInvoker inv = *this; + + if (scale == -1) + { + soft_cascade<<>>(inv, det, max_det, ctr, 0); + cudaSafeCall( cudaGetLastError()); + + grid = dim3(fw, fh / Policy::STA_Y, scales - downscales); + soft_cascade<<>>(inv, det, max_det, ctr, downscales); + } + else + { + if (scale >= downscales) + soft_cascade<<>>(inv, det, max_det, ctr, scale); + else + soft_cascade<<>>(inv, det, max_det, ctr, scale); + } + + if (!stream) + { + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } +} + +template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const; + +} }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 60df55882..8eb080e23 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -72,9 +72,9 @@ struct __align__(16) Octave struct __align__(8) Level //is actually 24 bytes { int octave; + int step; float relScale; - float shrScale; // used for marking detection float scaling[2]; // calculated according to Dollal paper // for 640x480 we can not get overflow @@ -115,31 +115,41 @@ struct __align__(16) Detection : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; }; -struct CascadePolicy +struct GK107PolicyX4 { - enum {STA_X = 32, STA_Y = 8}; + enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4}; + static const dim3 block() + { + return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y); + } }; template struct CascadeInvoker { - CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {} + CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {} + CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages, const PtrStepSzb& _nodes, const PtrStepSzf& _leaves) - : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()), - nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()) + : levels((const Level*)_levels.ptr()), + stages((const float*)_stages.ptr()), + nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()), + scales(_levels.cols / sizeof(Level)) {} const Level* levels; - const Octave* octaves; - const float* stages; const Node* nodes; const float* leaves; + int scales; + void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const; + + template + __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const; }; } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index a69be9239..6133bd1cb 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -63,7 +63,7 @@ void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } #include cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h) -: octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) +: octave(idx), step(oct.stages), relScale(scale / oct.scale) { workRect.x = round(w / (float)oct.shrinkage); workRect.y = round(h / (float)oct.shrinkage); @@ -100,7 +100,7 @@ namespace imgproc { struct cv::gpu::SCascade::Fields { - static Fields* parseCascade(const FileNode &root, const float mins, const float maxs) + static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals) { static const char *const SC_STAGE_TYPE = "stageType"; static const char *const SC_BOOST = "BOOST"; @@ -119,11 +119,8 @@ struct cv::gpu::SCascade::Fields static const char *const SC_ORIG_W = "width"; static const char *const SC_ORIG_H = "height"; - int origWidth = (int)root[SC_ORIG_W]; - CV_Assert(origWidth == ORIG_OBJECT_WIDTH); - + int origWidth = (int)root[SC_ORIG_W]; int origHeight = (int)root[SC_ORIG_H]; - CV_Assert(origHeight == ORIG_OBJECT_HEIGHT); static const char *const SC_OCTAVES = "octaves"; static const char *const SC_STAGES = "stages"; @@ -142,7 +139,6 @@ struct cv::gpu::SCascade::Fields static const char * const SC_F_CHANNEL = "channel"; static const char * const SC_F_RECT = "rect"; - FileNode fn = root[SC_OCTAVES]; if (fn.empty()) return false; @@ -167,8 +163,8 @@ struct cv::gpu::SCascade::Fields ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); ushort2 size; - size.x = cvRound(ORIG_OBJECT_WIDTH * scale); - size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); + size.x = cvRound(origWidth * scale); + size.y = cvRound(origHeight * scale); shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); Octave octave(octIndex, nstages, shrinkage, size, scale); @@ -245,11 +241,11 @@ struct cv::gpu::SCascade::Fields CV_Assert(!hleaves.empty()); std::vector vlevels; - float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1); + float logFactor = (::log(maxs) - ::log(mins)) / (totals -1); float scale = mins; int downscales = 0; - for (int sc = 0; sc < TOTAL_SCALES; ++sc) + for (int sc = 0; sc < totals; ++sc) { int width = ::std::max(0.0f, FRAME_WIDTH - (origWidth * scale)); int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale)); @@ -302,7 +298,7 @@ struct cv::gpu::SCascade::Fields leaves.upload(hleaves); levels.upload(hlevels); - invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); + invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); } @@ -456,16 +452,13 @@ public: GpuMat sobelBuf; - device::icf::CascadeInvoker invoker; + device::icf::CascadeInvoker invoker; enum { BOOST = 0 }; enum { FRAME_WIDTH = 640, FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128, HOG_BINS = 6, LUV_BINS = 3, HOG_LUV_BINS = 10 @@ -480,21 +473,19 @@ cv::gpu::SCascade::~SCascade() { delete fields; } bool cv::gpu::SCascade::load(const FileNode& fn) { if (fields) delete fields; - fields = Fields::parseCascade(fn, minScale, maxScale); + fields = Fields::parseCascade(fn, minScale, maxScale, scales); return fields != 0; } void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const { + CV_Assert(fields); + const GpuMat colored = image.getGpuMat(); // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); - - // we guess user knows about shrincage - // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); - Fields& flds = *fields; if (colored.type() == CV_8UC3) @@ -518,15 +509,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const { + CV_Assert(fields); + const GpuMat colored = image.getGpuMat(); // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); - // we guess user knows about shrincage - // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); - Fields& flds = *fields; - if (colored.type() == CV_8UC3) { // only this window size allowed @@ -549,6 +538,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const { + CV_Assert(fields); + const GpuMat roi = _roi.getGpuMat(); _mask.create( roi.cols / 4, roi.rows / 4, roi.type() ); GpuMat mask = _mask.getGpuMat(); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index cfae940c7..7034b33b9 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -172,7 +172,7 @@ GPU_TEST_P(SCascadeTestRoi, detect, sub.setTo(1); cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } - + objectBoxes.setTo(0); cascade.genRoi(rois, trois); cascade.detect(colored, trois, objectBoxes); @@ -222,7 +222,7 @@ GPU_TEST_P(SCascadeTestLevel, detect, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); int level = GET_PARAM(3); cascade.detect(colored, trois, objectBoxes, level); @@ -281,7 +281,7 @@ GPU_TEST_P(SCascadeTestAll, detect, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); cascade.detect(colored, trois, objectBoxes); typedef cv::gpu::SCascade::Detection Detection; @@ -321,7 +321,7 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); cascade.detect(hogluv, trois, objectBoxes); typedef cv::gpu::SCascade::Detection Detection; @@ -357,7 +357,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois, s); - + objectBoxes.setTo(0); cascade.detect(colored, trois, objectBoxes, s); cudaDeviceSynchronize();