first version of soft cascade on GPU
This commit is contained in:
		@@ -42,11 +42,48 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <icf.hpp>
 | 
					#include <icf.hpp>
 | 
				
			||||||
#include <opencv2/gpu/device/saturate_cast.hpp>
 | 
					#include <opencv2/gpu/device/saturate_cast.hpp>
 | 
				
			||||||
 | 
					#include <stdio.h>
 | 
				
			||||||
 | 
					#include <float.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace cv { namespace gpu {
 | 
					namespace cv { namespace gpu { namespace device {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace icf {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 namespace device {
 | 
					    enum {
 | 
				
			||||||
 | 
					        HOG_BINS = 6,
 | 
				
			||||||
 | 
					        HOG_LUV_BINS = 10,
 | 
				
			||||||
 | 
					        WIDTH = 640,
 | 
				
			||||||
 | 
					        HEIGHT = 480,
 | 
				
			||||||
 | 
					        GREY_OFFSET = HEIGHT * HOG_LUV_BINS
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    __global__ void magToHist(const uchar* __restrict__ mag,
 | 
				
			||||||
 | 
					                              const float* __restrict__ angle, const int angPitch,
 | 
				
			||||||
 | 
					                                    uchar* __restrict__ hog,   const int hogPitch)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 | 
				
			||||||
 | 
					        const int x = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const int bin = (int)(angle[y * angPitch + x]);
 | 
				
			||||||
 | 
					        const uchar val = mag[y * angPitch + x];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
 | 
				
			||||||
 | 
					        uchar* hog = (uchar*)hogluv.ptr();
 | 
				
			||||||
 | 
					        const float* angle = (const float*)nangle.ptr();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        dim3 block(32, 8);
 | 
				
			||||||
 | 
					        dim3 grid(WIDTH / 32, HEIGHT / 8);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
 | 
				
			||||||
 | 
					        cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
 | 
					        cudaSafeCall( cudaDeviceSynchronize() );
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
enum {
 | 
					enum {
 | 
				
			||||||
    HOG_BINS = 6,
 | 
					    HOG_BINS = 6,
 | 
				
			||||||
@@ -185,65 +222,175 @@ __global__ void intCol(ushort* __restrict__ sum, const int pitch)
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch)
 | 
					__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch,
 | 
				
			||||||
 | 
					    PtrStepSz<uchar4> objects)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    cascade.detectAt();
 | 
					    cascade.detectAt(hogluv, pitch, objects);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void __device icf::Cascade::detectAt() const
 | 
					float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
 | 
				
			||||||
 | 
					                                     const int channel, const float threshold) const
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
					    float relScale = level.relScale;
 | 
				
			||||||
 | 
					    float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // rescale
 | 
				
			||||||
 | 
					    scaledRect.x = __float2int_rn(relScale * scaledRect.x);
 | 
				
			||||||
 | 
					    scaledRect.y = __float2int_rn(relScale * scaledRect.y);
 | 
				
			||||||
 | 
					    scaledRect.z = __float2int_rn(relScale * scaledRect.z);
 | 
				
			||||||
 | 
					    scaledRect.w = __float2int_rn(relScale * scaledRect.w);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    float approx = 1.f;
 | 
				
			||||||
 | 
					    if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        const float expected_new_area = farea * relScale * relScale;
 | 
				
			||||||
 | 
					        approx = expected_new_area / sarea;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // compensation areas rounding
 | 
				
			||||||
 | 
					    float rootThreshold = threshold / approx;
 | 
				
			||||||
 | 
					    rootThreshold *= level.scaling[(int)(channel > 6)];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return rootThreshold;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const
 | 
					typedef unsigned char uchar;
 | 
				
			||||||
 | 
					float __device get(const uchar* __restrict__ hogluv, const int pitch,
 | 
				
			||||||
 | 
					                   const int x, const int y, int channel, uchar4 area)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    const uchar* curr = hogluv + ((channel * 121) + y) * pitch;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int a = curr[area.y * pitch + x + area.x];
 | 
				
			||||||
 | 
					    int b = curr[area.y * pitch + x + area.z];
 | 
				
			||||||
 | 
					    int c = curr[area.w * pitch + x + area.z];
 | 
				
			||||||
 | 
					    int d = curr[area.w * pitch + x + area.x];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return (a - b + c - d);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch,
 | 
				
			||||||
 | 
					                                    PtrStepSz<uchar4>& objects) const
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    const icf::Level* lls = (const icf::Level*)levels.ptr();
 | 
				
			||||||
 | 
					    Level level = lls[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int y = blockIdx.y * blockDim.y + threadIdx.y;
 | 
				
			||||||
 | 
					    const int x = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (x >= level.workRect.x || y >= level.workRect.y) return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
 | 
				
			||||||
 | 
					    const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    float detectionScore = 0.f;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int st = stBegin;
 | 
				
			||||||
 | 
					    for(; st < stEnd; ++st)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        const float stage = stages(0, st);
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            const int nId = st * 3;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            // work with root node
 | 
				
			||||||
 | 
					            const Node node = ((const Node*)nodes.ptr())[nId];
 | 
				
			||||||
 | 
					            const Feature feature = ((const Feature*)features.ptr())[node.feature];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            uchar4 scaledRect = feature.rect;
 | 
				
			||||||
 | 
					            float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            int next = 1 + (int)(sum >= threshold);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            // leaves
 | 
				
			||||||
 | 
					            const Node leaf = ((const Node*)nodes.ptr())[nId + next];
 | 
				
			||||||
 | 
					            const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            scaledRect = fLeaf.rect;
 | 
				
			||||||
 | 
					            threshold = rescale(level, scaledRect, feature.channel, node.threshold);
 | 
				
			||||||
 | 
					            sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
 | 
				
			||||||
 | 
					            float impact = leaves(0, (st * 4) + lShift);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            detectionScore += impact;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if (detectionScore <= stage) break;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st);
 | 
				
			||||||
 | 
					    //     printf("x %d y %d: %d\n", x, y, st);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (st == stEnd)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        // printf("  got %d\n", st);
 | 
				
			||||||
 | 
					        uchar4 a;
 | 
				
			||||||
 | 
					        a.x = level.workRect.x;
 | 
				
			||||||
 | 
					        a.y = level.workRect.y;
 | 
				
			||||||
 | 
					        objects(0, threadIdx.x) = a;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz<uchar4> objects,
 | 
				
			||||||
 | 
					                          cudaStream_t stream) const
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    // detection kernel
 | 
					    // detection kernel
 | 
				
			||||||
    dim3 block(32, 8, 1);
 | 
					    dim3 block(32, 8, 1);
 | 
				
			||||||
    dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64);
 | 
					    // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
 | 
				
			||||||
    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort));
 | 
					    dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
 | 
				
			||||||
 | 
					    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort), objects);
 | 
				
			||||||
 | 
					    cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
    if (!stream)
 | 
					    if (!stream)
 | 
				
			||||||
        cudaSafeCall( cudaDeviceSynchronize() );
 | 
					        cudaSafeCall( cudaDeviceSynchronize() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream)
 | 
					void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    // color convertin kernel
 | 
					//     // // color convertin kernel
 | 
				
			||||||
    dim3 block(32, 8);
 | 
					//     // dim3 block(32, 8);
 | 
				
			||||||
    dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
 | 
					//     // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
 | 
					//     // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
 | 
				
			||||||
    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
 | 
					//     // device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
 | 
				
			||||||
                                                    rgb.step / sizeof(uchar3), dmem.step);
 | 
					//     //                                                 rgb.step / sizeof(uchar3), dmem.step);
 | 
				
			||||||
    cudaSafeCall( cudaGetLastError());
 | 
					//     // cudaSafeCall( cudaGetLastError());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // hog calculation kernel
 | 
					//     // // hog calculation kernel
 | 
				
			||||||
    channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
 | 
					//     // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
 | 
				
			||||||
    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
 | 
					//     // device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
 | 
				
			||||||
    cudaSafeCall( cudaGetLastError() );
 | 
					//     // cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    const int shrWidth  = FRAME_WIDTH / shrinkage;
 | 
					//     // const int shrWidth  = FRAME_WIDTH / shrinkage;
 | 
				
			||||||
    const int shrHeight = FRAME_HEIGHT / shrinkage;
 | 
					//     // const int shrHeight = FRAME_HEIGHT / shrinkage;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // decimate kernel
 | 
					//     // // decimate kernel
 | 
				
			||||||
    grid = dim3(shrWidth / 32, shrHeight / 8);
 | 
					//     // grid = dim3(shrWidth / 32, shrHeight / 8);
 | 
				
			||||||
    device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
 | 
					//     // device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
 | 
				
			||||||
    cudaSafeCall( cudaGetLastError() );
 | 
					//     // cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // integrate rows
 | 
					//     // // integrate rows
 | 
				
			||||||
    block = dim3(shrWidth, 1);
 | 
					//     // block = dim3(shrWidth, 1);
 | 
				
			||||||
    grid = dim3(shrHeight * HOG_LUV_BINS, 1);
 | 
					//     // grid = dim3(shrHeight * HOG_LUV_BINS, 1);
 | 
				
			||||||
    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
 | 
					//     // device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
 | 
				
			||||||
        shrunk.step, hogluv.step / sizeof(ushort));
 | 
					//     //     shrunk.step, hogluv.step / sizeof(ushort));
 | 
				
			||||||
    cudaSafeCall( cudaGetLastError() );
 | 
					//     // cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // integrate cols
 | 
					//     // // integrate cols
 | 
				
			||||||
    block = dim3(128, 1);
 | 
					//     // block = dim3(128, 1);
 | 
				
			||||||
    grid = dim3(shrWidth * HOG_LUV_BINS, 1);
 | 
					//     // grid = dim3(shrWidth * HOG_LUV_BINS, 1);
 | 
				
			||||||
    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
 | 
					//     // device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
 | 
				
			||||||
    cudaSafeCall( cudaGetLastError() );
 | 
					//     // cudaSafeCall( cudaGetLastError() );
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
@@ -59,6 +59,42 @@ using cv::gpu::PtrStepSzf;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
typedef unsigned char uchar;
 | 
					typedef unsigned char uchar;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct __align__(16) Octave
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    ushort index;
 | 
				
			||||||
 | 
					    ushort stages;
 | 
				
			||||||
 | 
					    ushort shrinkage;
 | 
				
			||||||
 | 
					    ushort2 size;
 | 
				
			||||||
 | 
					    float scale;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
 | 
				
			||||||
 | 
					    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct __align__(8) Level //is actually 24 bytes
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    int octave;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // float origScale; //not actually used
 | 
				
			||||||
 | 
					    float relScale;
 | 
				
			||||||
 | 
					    float shrScale;   // used for marking detection
 | 
				
			||||||
 | 
					    float scaling[2]; // calculated according to Dollal paper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // for 640x480 we can not get overflow
 | 
				
			||||||
 | 
					    uchar2 workRect;
 | 
				
			||||||
 | 
					    uchar2 objSize;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
 | 
				
			||||||
 | 
					    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        workRect.x = round(w / (float)oct.shrinkage);
 | 
				
			||||||
 | 
					        workRect.y = round(h / (float)oct.shrinkage);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        objSize.x  = round(oct.size.x * relScale);
 | 
				
			||||||
 | 
					        objSize.y  = round(oct.size.y * relScale);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct Cascade
 | 
					struct Cascade
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    Cascade() {}
 | 
					    Cascade() {}
 | 
				
			||||||
@@ -66,8 +102,10 @@ struct Cascade
 | 
				
			|||||||
        const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
 | 
					        const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
 | 
				
			||||||
    : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 | 
					    : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const;
 | 
					    void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
 | 
				
			||||||
    void __device detectAt() const;
 | 
					    void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
 | 
				
			||||||
 | 
					    float __device rescale(const icf::Level& level, uchar4& scaledRect,
 | 
				
			||||||
 | 
					                           const int channel, const float threshold) const;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    PtrStepSzb octaves;
 | 
					    PtrStepSzb octaves;
 | 
				
			||||||
    PtrStepSzf stages;
 | 
					    PtrStepSzf stages;
 | 
				
			||||||
@@ -108,18 +146,6 @@ struct ChannelStorage
 | 
				
			|||||||
    static const float magnitudeScaling = 1.f ;// / sqrt(2);
 | 
					    static const float magnitudeScaling = 1.f ;// / sqrt(2);
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct __align__(16) Octave
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    ushort index;
 | 
					 | 
				
			||||||
    ushort stages;
 | 
					 | 
				
			||||||
    ushort shrinkage;
 | 
					 | 
				
			||||||
    ushort2 size;
 | 
					 | 
				
			||||||
    float scale;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
 | 
					 | 
				
			||||||
    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct __align__(8) Node
 | 
					struct __align__(8) Node
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    int feature;
 | 
					    int feature;
 | 
				
			||||||
@@ -135,30 +161,6 @@ struct __align__(8) Feature
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
 | 
					    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					 | 
				
			||||||
struct __align__(8) Level //is actually 24 bytes
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    int octave;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // float origScale; //not actually used
 | 
					 | 
				
			||||||
    float relScale;
 | 
					 | 
				
			||||||
    float shrScale;   // used for marking detection
 | 
					 | 
				
			||||||
    float scaling[2]; // calculated according to Dollal paper
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // for 640x480 we can not get overflow
 | 
					 | 
				
			||||||
    uchar2 workRect;
 | 
					 | 
				
			||||||
    uchar2 objSize;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
 | 
					 | 
				
			||||||
    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
        workRect.x = round(w / (float)oct.shrinkage);
 | 
					 | 
				
			||||||
        workRect.y = round(h / (float)oct.shrinkage);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        objSize.x  = round(oct.size.x * relScale);
 | 
					 | 
				
			||||||
        objSize.y  = round(oct.size.y * relScale);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
}}}
 | 
					}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -41,6 +41,7 @@
 | 
				
			|||||||
//M*/
 | 
					//M*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <precomp.hpp>
 | 
					#include <precomp.hpp>
 | 
				
			||||||
 | 
					#include "opencv2/highgui/highgui.hpp"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if !defined (HAVE_CUDA)
 | 
					#if !defined (HAVE_CUDA)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -58,6 +59,12 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <icf.hpp>
 | 
					#include <icf.hpp>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace cv { namespace gpu { namespace device {
 | 
				
			||||||
 | 
					namespace icf {
 | 
				
			||||||
 | 
					    void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct cv::gpu::SoftCascade::Filds
 | 
					struct cv::gpu::SoftCascade::Filds
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    // scales range
 | 
					    // scales range
 | 
				
			||||||
@@ -81,6 +88,16 @@ struct cv::gpu::SoftCascade::Filds
 | 
				
			|||||||
    // 161x121x10
 | 
					    // 161x121x10
 | 
				
			||||||
    GpuMat hogluv;
 | 
					    GpuMat hogluv;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // will be removed in final version
 | 
				
			||||||
 | 
					    // temporial mat for cvtColor
 | 
				
			||||||
 | 
					    GpuMat luv;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // temporial mat for integrall
 | 
				
			||||||
 | 
					    GpuMat integralBuffer;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // temp matrix for sobel and cartToPolar
 | 
				
			||||||
 | 
					    GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::vector<float> scales;
 | 
					    std::vector<float> scales;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    icf::Cascade cascade;
 | 
					    icf::Cascade cascade;
 | 
				
			||||||
@@ -100,9 +117,9 @@ struct cv::gpu::SoftCascade::Filds
 | 
				
			|||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bool fill(const FileNode &root, const float mins, const float maxs);
 | 
					    bool fill(const FileNode &root, const float mins, const float maxs);
 | 
				
			||||||
    void detect(cudaStream_t stream) const
 | 
					    void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
        cascade.detect(hogluv, stream);
 | 
					        cascade.detect(hogluv, objects, stream);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
@@ -284,7 +301,18 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
 | 
				
			|||||||
    // allocate buffers
 | 
					    // allocate buffers
 | 
				
			||||||
    dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
 | 
					    dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
 | 
				
			||||||
    shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
 | 
					    shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
 | 
				
			||||||
    hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1);
 | 
					    // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
 | 
				
			||||||
 | 
					    hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
 | 
				
			||||||
 | 
					    luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
 | 
				
			||||||
 | 
					    integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					    dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					    angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					    mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					    nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
 | 
					    storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
 | 
				
			||||||
    return true;
 | 
					    return true;
 | 
				
			||||||
@@ -393,21 +421,71 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 | 
				
			|||||||
    return true;
 | 
					    return true;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/,
 | 
					void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
 | 
				
			||||||
                                GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s)
 | 
					                                GpuMat& objects, const int /*rejectfactor*/, Stream s)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    // only color images are supperted
 | 
					    // only color images are supperted
 | 
				
			||||||
    CV_Assert(image.type() == CV_8UC3);
 | 
					    CV_Assert(colored.type() == CV_8UC3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // only this window size allowed
 | 
					    // // only this window size allowed
 | 
				
			||||||
    CV_Assert(image.cols == 640 && image.rows == 480);
 | 
					    CV_Assert(colored.cols == 640 && colored.rows == 480);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Filds& flds = *filds;
 | 
					    Filds& flds = *filds;
 | 
				
			||||||
 | 
					    GpuMat& dmem = flds.dmem;
 | 
				
			||||||
 | 
					    cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
 | 
				
			||||||
 | 
					    GpuMat& shrunk = flds.shrunk;
 | 
				
			||||||
 | 
					    int w = shrunk.cols;
 | 
				
			||||||
 | 
					    int h = colored.rows / flds.storage.shrinkage;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cudaStream_t stream = StreamAccessor::getStream(s);
 | 
					    cudaStream_t stream = StreamAccessor::getStream(s);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    flds.storage.frame(image, stream);
 | 
					    std::vector<GpuMat> splited;
 | 
				
			||||||
    flds.detect(stream);
 | 
					    for(int i = 0; i < 3; ++i)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    //create hog
 | 
				
			||||||
 | 
					    cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
 | 
				
			||||||
 | 
					    cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
 | 
				
			||||||
 | 
					    cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
 | 
				
			||||||
 | 
					    flds.nmag.convertTo(magCannel, CV_8UC1);
 | 
				
			||||||
 | 
					    device::icf::fillBins(dmem, flds.nangle);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // create luv
 | 
				
			||||||
 | 
					    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
 | 
				
			||||||
 | 
					    cv::gpu::split(flds.luv, splited);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
 | 
				
			||||||
 | 
					    cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
 | 
				
			||||||
 | 
					    // cv::Mat cpu(plane);
 | 
				
			||||||
 | 
					    // cv::imshow("channels", cpu);
 | 
				
			||||||
 | 
					    // cv::waitKey(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // fer debug purpose
 | 
				
			||||||
 | 
					    // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        GpuMat channel(shrunk, cv::Rect(0, h  * i, w, h ));
 | 
				
			||||||
 | 
					        GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
 | 
				
			||||||
 | 
					        cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // detection
 | 
				
			||||||
 | 
					    flds.detect(objects, stream);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // flds.storage.frame(colored, stream);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
		Reference in New Issue
	
	Block a user