[~] Refactored device reduction for better reuse (now with functor semantics)

parent 56531c6161
commit 3396a2ec1a
@@ -59,6 +59,7 @@
 #include <cstdio>
 
 #include "NCV.hpp"
+#include "NCVAlg.hpp"
 #include "NPP_staging/NPP_staging.hpp"
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVHaarObjectDetection.hpp"
@@ -84,11 +85,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
     pos += K_WARP_SIZE;
     s_Data[pos] = idata;
 
-    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    //{
-    //    s_Data[pos] += s_Data[pos - offset];
-    //}
-
     s_Data[pos] += s_Data[pos - 1];
     s_Data[pos] += s_Data[pos - 2];
     s_Data[pos] += s_Data[pos - 4];
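For reference, the unrolled additions kept above (together with the further offsets below this excerpt) implement an inclusive Hillis-Steele scan across one warp; the deleted comment block is its loop form. A minimal sketch of the loop variant follows. Only the loop itself comes from the deleted comment; the index setup is an assumption about the surrounding doubled shared-memory layout, with K_WARP_SIZE taken to be 32:

template <typename T>
inline __device__ T warpScanInclusiveLoop(T idata, volatile T *s_Data)
{
    // Doubled per-warp layout: the lower half is zero-padded so that
    // reads at 'pos - offset' never underrun the warp's segment.
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    for (Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
    {
        s_Data[pos] += s_Data[pos - offset];
    }
    return s_Data[pos];
}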
@@ -234,60 +230,6 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
 }
 
 
-__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
-{
-    __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
-    reductor[threadIdx.x] = partialSum;
-    __syncthreads();
-
-#if defined CPU_FP_COMPLIANCE
-    if (!threadIdx.x)
-    {
-        Ncv32f sum = 0.0f;
-        for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)
-        {
-            sum += reductor[i];
-        }
-        reductor[0] = sum;
-    }
-#else
-
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
-    if (threadIdx.x < 128)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 128];
-    }
-    __syncthreads();
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
-    if (threadIdx.x < 64)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 64];
-    }
-    __syncthreads();
-#endif
-
-    if (threadIdx.x < 32)
-    {
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
-        reductor[threadIdx.x] += reductor[threadIdx.x + 32];
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
-        reductor[threadIdx.x] += reductor[threadIdx.x + 16];
-#endif
-        reductor[threadIdx.x] += reductor[threadIdx.x + 8];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 4];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 2];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 1];
-    }
-#endif
-
-    __syncthreads();
-
-    return reductor[0];
-}
-
-
 __device__ Ncv32u d_outMaskPosition;
 
 
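Worth noting: the removed reduceSpecialization had a CPU_FP_COMPLIANCE path in which thread 0 summed all partial results sequentially, reproducing the CPU's left-to-right floating-point addition order; the new subReduce in NCVAlg.hpp keeps only the tree-shaped reduction. If bit-exact CPU parity were still needed, a sequential variant could be expressed with the same functor interface. A hedged sketch (hypothetical helper, not in this commit):

template<typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduceOrdered(Tdata threadElem)
{
    Tfunc functor;
    __shared__ Tdata reduceArr[nThreads];
    reduceArr[threadIdx.x] = threadElem;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        // Left-to-right combine matches the CPU accumulation order.
        for (Ncv32u i = 1; i < nThreads; i++)
        {
            functor.reduce(reduceArr[0], reduceArr[i]);
        }
    }
    __syncthreads();
    return reduceArr[0];
}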
@@ -623,7 +565,14 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
         curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
     }
 
-    Ncv32f finalStageSum = reduceSpecialization(curStageSum);
+    struct functorAddValues
+    {
+        __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
+        {
+            in1out += in2;
+        }
+    };
+    Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);
 
     if (finalStageSum < stageThreshold)
     {
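Since subReduce takes the combining operation as a template functor, the same shared-memory reduction tree can be reused for other associative operations, which is the "better reuse" the commit message refers to. A minimal sketch (functorMaxValues and someThreadValue are hypothetical, not part of this commit) computing a block-wide maximum instead of a sum:

struct functorMaxValues  // same interface as functorAddValues above
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out = in1out > in2 ? in1out : in2;
    }
};

Ncv32f blockMax = subReduce<Ncv32f, functorMaxValues, NUM_THREADS_CLASSIFIERPARALLEL>(someThreadValue);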
modules/gpu/src/nvidia/core/NCVAlg.hpp (new file, 97 lines)
@@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef _ncv_alg_hpp_
+#define _ncv_alg_hpp_
+
+#include "NCV.hpp"
+
+
+template<typename T>
+static T divUp(T a, T b)
+{
+    return (a + b - 1) / b;
+}
+
+
+template<typename Tdata, class Tfunc, Ncv32u nThreads>
+static __device__ Tdata subReduce(Tdata threadElem)
+{
+    Tfunc functor;
+
+    __shared__ Tdata reduceArr[nThreads];
+    reduceArr[threadIdx.x] = threadElem;
+    __syncthreads();
+
+    if (nThreads >= 256 && threadIdx.x < 128)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
+    }
+    __syncthreads();
+
+    if (nThreads >= 128 && threadIdx.x < 64)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
+    }
+    __syncthreads();
+
+    if (threadIdx.x < 32)
+    {
+        if (nThreads >= 64)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
+        }
+        if (nThreads >= 32)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+        }
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
+    }
+
+    __syncthreads();
+    return reduceArr[0];
+}
+
+
+#endif //_ncv_alg_hpp_
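A usage sketch for the new header (illustrative only; sumKernel, BLOCK_SIZE, and the buffers are hypothetical): one value per thread is reduced to one partial sum per block, using a functor with the same interface as the one defined inside the detection kernel above.

#include "NCVAlg.hpp"

#define BLOCK_SIZE 256  // must match the launch block width; power of two <= 256

struct functorAddValues  // same interface as the functor used in the commit
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out += in2;
    }
};

__global__ void sumKernel(const Ncv32f *d_in, Ncv32f *d_out)
{
    Ncv32f myVal = d_in[blockIdx.x * BLOCK_SIZE + threadIdx.x];

    // Block-wide tree reduction from NCVAlg.hpp; the result is valid in all threads.
    Ncv32f blockSum = subReduce<Ncv32f, functorAddValues, BLOCK_SIZE>(myVal);

    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;  // one partial sum per block
    }
}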