3rd attempt to prepare patch with improved OpenCL kernels of CascadeClassifier.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -3,6 +3,72 @@
|
||||
namespace cv
|
||||
{
|
||||
|
||||
class FeatureEvaluator
|
||||
{
|
||||
public:
|
||||
enum
|
||||
{
|
||||
HAAR = 0,
|
||||
LBP = 1,
|
||||
HOG = 2
|
||||
};
|
||||
|
||||
struct ScaleData
|
||||
{
|
||||
ScaleData() { scale = 0.f; layer_ofs = ystep = 0; }
|
||||
Size getWorkingSize(Size winSize) const
|
||||
{
|
||||
return Size(std::max(szi.width - winSize.width, 0),
|
||||
std::max(szi.height - winSize.height, 0));
|
||||
}
|
||||
|
||||
float scale;
|
||||
Size szi;
|
||||
int layer_ofs, ystep;
|
||||
};
|
||||
|
||||
virtual ~FeatureEvaluator();
|
||||
|
||||
virtual bool read(const FileNode& node, Size origWinSize);
|
||||
virtual Ptr<FeatureEvaluator> clone() const;
|
||||
virtual int getFeatureType() const;
|
||||
int getNumChannels() const { return nchannels; }
|
||||
|
||||
virtual bool setImage(InputArray img, const std::vector<float>& scales);
|
||||
virtual bool setWindow(Point p, int scaleIdx);
|
||||
const ScaleData& getScaleData(int scaleIdx) const
|
||||
{
|
||||
CV_Assert( 0 <= scaleIdx && scaleIdx < (int)scaleData->size());
|
||||
return scaleData->at(scaleIdx);
|
||||
}
|
||||
virtual void getUMats(std::vector<UMat>& bufs);
|
||||
virtual void getMats();
|
||||
|
||||
Size getLocalSize() const { return localSize; }
|
||||
Size getLocalBufSize() const { return lbufSize; }
|
||||
|
||||
virtual float calcOrd(int featureIdx) const;
|
||||
virtual int calcCat(int featureIdx) const;
|
||||
|
||||
static Ptr<FeatureEvaluator> create(int type);
|
||||
|
||||
protected:
|
||||
enum { SBUF_VALID=1, USBUF_VALID=2 };
|
||||
int sbufFlag;
|
||||
|
||||
bool updateScaleData( Size imgsz, const std::vector<float>& _scales );
|
||||
virtual void computeChannels( int, InputArray ) {}
|
||||
virtual void computeOptFeatures() {}
|
||||
|
||||
Size origWinSize, sbufSize, localSize, lbufSize;
|
||||
int nchannels;
|
||||
Mat sbuf, rbuf;
|
||||
UMat urbuf, usbuf, ufbuf, uscaleData;
|
||||
|
||||
Ptr<std::vector<ScaleData> > scaleData;
|
||||
};
|
||||
|
||||
|
||||
class CascadeClassifierImpl : public BaseCascadeClassifier
|
||||
{
|
||||
public:
|
||||
@@ -54,9 +120,8 @@ protected:
|
||||
int yStep, double factor, std::vector<Rect>& candidates,
|
||||
std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
|
||||
Size sumSize0, bool outputRejectLevels = false );
|
||||
bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
|
||||
int yStep, double factor, Size sumSize0 );
|
||||
|
||||
bool ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
|
||||
std::vector<Rect>& candidates );
|
||||
|
||||
void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
|
||||
std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
|
||||
@@ -72,6 +137,7 @@ protected:
|
||||
};
|
||||
|
||||
friend class CascadeClassifierInvoker;
|
||||
friend class SparseCascadeClassifierInvoker;
|
||||
|
||||
template<class FEval>
|
||||
friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
|
||||
@@ -85,7 +151,7 @@ protected:
|
||||
template<class FEval>
|
||||
friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
|
||||
|
||||
int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight );
|
||||
int runAt( Ptr<FeatureEvaluator>& feval, Point pt, int scaleIdx, double& weight );
|
||||
|
||||
class Data
|
||||
{
|
||||
@@ -126,12 +192,10 @@ protected:
|
||||
|
||||
bool read(const FileNode &node);
|
||||
|
||||
bool isStumpBased() const { return maxNodesPerTree == 1; }
|
||||
|
||||
int stageType;
|
||||
int featureType;
|
||||
int ncategories;
|
||||
int maxNodesPerTree;
|
||||
int minNodesPerTree, maxNodesPerTree;
|
||||
Size origWinSize;
|
||||
|
||||
std::vector<Stage> stages;
|
||||
@@ -148,7 +212,7 @@ protected:
|
||||
|
||||
Ptr<MaskGenerator> maskGenerator;
|
||||
UMat ugrayImage, uimageBuffer;
|
||||
UMat ufacepos, ustages, ustumps, usubsets;
|
||||
UMat ufacepos, ustages, unodes, uleaves, usubsets;
|
||||
ocl::Kernel haarKernel, lbpKernel;
|
||||
bool tryOpenCL;
|
||||
|
||||
@@ -268,7 +332,6 @@ public:
|
||||
|
||||
enum { RECT_NUM = Feature::RECT_NUM };
|
||||
float calc( const int* pwin ) const;
|
||||
|
||||
void setOffsets( const Feature& _f, int step, int tofs );
|
||||
|
||||
int ofs[RECT_NUM][4];
|
||||
@@ -278,35 +341,34 @@ public:
|
||||
HaarEvaluator();
|
||||
virtual ~HaarEvaluator();
|
||||
|
||||
virtual bool read( const FileNode& node );
|
||||
virtual bool read( const FileNode& node, Size origWinSize);
|
||||
virtual Ptr<FeatureEvaluator> clone() const;
|
||||
virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }
|
||||
|
||||
virtual bool setImage(InputArray, Size origWinSize, Size sumSize);
|
||||
virtual bool setWindow(Point pt);
|
||||
virtual Rect getNormRect() const;
|
||||
virtual void getUMats(std::vector<UMat>& bufs);
|
||||
virtual bool setWindow(Point p, int scaleIdx);
|
||||
Rect getNormRect() const;
|
||||
int getSquaresOffset() const;
|
||||
|
||||
double operator()(int featureIdx) const
|
||||
float operator()(int featureIdx) const
|
||||
{ return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
|
||||
virtual double calcOrd(int featureIdx) const
|
||||
virtual float calcOrd(int featureIdx) const
|
||||
{ return (*this)(featureIdx); }
|
||||
|
||||
protected:
|
||||
Size origWinSize, sumSize0;
|
||||
virtual void computeChannels( int i, InputArray img );
|
||||
virtual void computeOptFeatures();
|
||||
|
||||
Ptr<std::vector<Feature> > features;
|
||||
Ptr<std::vector<OptFeature> > optfeatures;
|
||||
OptFeature* optfeaturesPtr; // optimization
|
||||
Ptr<std::vector<OptFeature> > optfeatures_lbuf;
|
||||
bool hasTiltedFeatures;
|
||||
|
||||
Mat sum0, sum, sqsum0, sqsum;
|
||||
UMat usum0, usum, usqsum0, usqsum, ufbuf;
|
||||
|
||||
int tofs, sqofs;
|
||||
Vec4i nofs;
|
||||
Rect normrect;
|
||||
int nofs[4];
|
||||
|
||||
const int* pwin;
|
||||
double varianceNormFactor;
|
||||
OptFeature* optfeaturesPtr; // optimization
|
||||
float varianceNormFactor;
|
||||
};
|
||||
|
||||
inline HaarEvaluator::Feature :: Feature()
|
||||
@@ -336,28 +398,6 @@ inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
|
||||
{
|
||||
weight[0] = _f.rect[0].weight;
|
||||
weight[1] = _f.rect[1].weight;
|
||||
weight[2] = _f.rect[2].weight;
|
||||
|
||||
Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
|
||||
if (_f.tilted)
|
||||
{
|
||||
CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
|
||||
CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
|
||||
CV_TILTED_PTRS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
|
||||
CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
|
||||
CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------- LBPEvaluator -------------------------------------
|
||||
|
||||
class LBPEvaluator : public FeatureEvaluator
|
||||
@@ -367,7 +407,7 @@ public:
|
||||
{
|
||||
Feature();
|
||||
Feature( int x, int y, int _block_w, int _block_h ) :
|
||||
rect(x, y, _block_w, _block_h) {}
|
||||
rect(x, y, _block_w, _block_h) {}
|
||||
|
||||
bool read(const FileNode& node );
|
||||
|
||||
@@ -386,27 +426,25 @@ public:
|
||||
LBPEvaluator();
|
||||
virtual ~LBPEvaluator();
|
||||
|
||||
virtual bool read( const FileNode& node );
|
||||
virtual bool read( const FileNode& node, Size origWinSize );
|
||||
virtual Ptr<FeatureEvaluator> clone() const;
|
||||
virtual int getFeatureType() const { return FeatureEvaluator::LBP; }
|
||||
|
||||
virtual bool setImage(InputArray image, Size _origWinSize, Size);
|
||||
virtual bool setWindow(Point pt);
|
||||
virtual void getUMats(std::vector<UMat>& bufs);
|
||||
virtual bool setWindow(Point p, int scaleIdx);
|
||||
|
||||
int operator()(int featureIdx) const
|
||||
{ return optfeaturesPtr[featureIdx].calc(pwin); }
|
||||
virtual int calcCat(int featureIdx) const
|
||||
{ return (*this)(featureIdx); }
|
||||
protected:
|
||||
Size origWinSize, sumSize0;
|
||||
virtual void computeChannels( int i, InputArray img );
|
||||
virtual void computeOptFeatures();
|
||||
|
||||
Ptr<std::vector<Feature> > features;
|
||||
Ptr<std::vector<OptFeature> > optfeatures;
|
||||
Ptr<std::vector<OptFeature> > optfeatures_lbuf;
|
||||
OptFeature* optfeaturesPtr; // optimization
|
||||
|
||||
Mat sum0, sum;
|
||||
UMat usum0, usum, ufbuf;
|
||||
|
||||
const int* pwin;
|
||||
};
|
||||
|
||||
@@ -436,98 +474,6 @@ inline int LBPEvaluator::OptFeature :: calc( const int* p ) const
|
||||
(CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0);
|
||||
}
|
||||
|
||||
inline void LBPEvaluator::OptFeature :: setOffsets( const Feature& _f, int step )
|
||||
{
|
||||
Rect tr = _f.rect;
|
||||
CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
|
||||
tr.x += 2*_f.rect.width;
|
||||
CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
|
||||
tr.y += 2*_f.rect.height;
|
||||
CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
|
||||
tr.x -= 2*_f.rect.width;
|
||||
CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
|
||||
}
|
||||
|
||||
//---------------------------------------------- HOGEvaluator -------------------------------------------
|
||||
|
||||
class HOGEvaluator : public FeatureEvaluator
|
||||
{
|
||||
public:
|
||||
struct Feature
|
||||
{
|
||||
Feature();
|
||||
float calc( int offset ) const;
|
||||
void updatePtrs( const std::vector<Mat>& _hist, const Mat &_normSum );
|
||||
bool read( const FileNode& node );
|
||||
|
||||
enum { CELL_NUM = 4, BIN_NUM = 9 };
|
||||
|
||||
Rect rect[CELL_NUM];
|
||||
int featComponent; //component index from 0 to 35
|
||||
const float* pF[4]; //for feature calculation
|
||||
const float* pN[4]; //for normalization calculation
|
||||
};
|
||||
HOGEvaluator();
|
||||
virtual ~HOGEvaluator();
|
||||
virtual bool read( const FileNode& node );
|
||||
virtual Ptr<FeatureEvaluator> clone() const;
|
||||
virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
|
||||
virtual bool setImage( InputArray image, Size winSize, Size );
|
||||
virtual bool setWindow( Point pt );
|
||||
double operator()(int featureIdx) const
|
||||
{
|
||||
return featuresPtr[featureIdx].calc(offset);
|
||||
}
|
||||
virtual double calcOrd( int featureIdx ) const
|
||||
{
|
||||
return (*this)(featureIdx);
|
||||
}
|
||||
|
||||
private:
|
||||
virtual void integralHistogram( const Mat& srcImage, std::vector<Mat> &histogram, Mat &norm, int nbins ) const;
|
||||
|
||||
Size origWinSize;
|
||||
Ptr<std::vector<Feature> > features;
|
||||
Feature* featuresPtr;
|
||||
std::vector<Mat> hist;
|
||||
Mat normSum;
|
||||
int offset;
|
||||
};
|
||||
|
||||
inline HOGEvaluator::Feature :: Feature()
|
||||
{
|
||||
rect[0] = rect[1] = rect[2] = rect[3] = Rect();
|
||||
pF[0] = pF[1] = pF[2] = pF[3] = 0;
|
||||
pN[0] = pN[1] = pN[2] = pN[3] = 0;
|
||||
featComponent = 0;
|
||||
}
|
||||
|
||||
inline float HOGEvaluator::Feature :: calc( int _offset ) const
|
||||
{
|
||||
float res = CALC_SUM(pF, _offset);
|
||||
float normFactor = CALC_SUM(pN, _offset);
|
||||
res = (res > 0.001f) ? (res / ( normFactor + 0.001f) ) : 0.f;
|
||||
return res;
|
||||
}
|
||||
|
||||
inline void HOGEvaluator::Feature :: updatePtrs( const std::vector<Mat> &_hist, const Mat &_normSum )
|
||||
{
|
||||
int binIdx = featComponent % BIN_NUM;
|
||||
int cellIdx = featComponent / BIN_NUM;
|
||||
Rect normRect = Rect( rect[0].x, rect[0].y, 2*rect[0].width, 2*rect[0].height );
|
||||
|
||||
const float* featBuf = (const float*)_hist[binIdx].data;
|
||||
size_t featStep = _hist[0].step / sizeof(featBuf[0]);
|
||||
|
||||
const float* normBuf = (const float*)_normSum.data;
|
||||
size_t normStep = _normSum.step / sizeof(normBuf[0]);
|
||||
|
||||
CV_SUM_PTRS( pF[0], pF[1], pF[2], pF[3], featBuf, rect[cellIdx], featStep );
|
||||
CV_SUM_PTRS( pN[0], pN[1], pN[2], pN[3], normBuf, normRect, normStep );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------- predictor functions -------------------------------------
|
||||
|
||||
@@ -662,11 +608,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
|
||||
const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
|
||||
const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
|
||||
|
||||
#ifdef HAVE_TEGRA_OPTIMIZATION
|
||||
float tmp = 0; // float accumulator -- float operations are quicker
|
||||
#else
|
||||
double tmp = 0;
|
||||
#endif
|
||||
float tmp = 0;
|
||||
for( int si = 0; si < nstages; si++ )
|
||||
{
|
||||
const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
|
||||
|
@@ -1,6 +1,18 @@
|
||||
///////////////////////////// OpenCL kernels for face detection //////////////////////////////
|
||||
////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
|
||||
|
||||
//
|
||||
// the code has been derived from the OpenCL Haar cascade kernel by
|
||||
//
|
||||
// Niko Li, newlife20080214@gmail.com
|
||||
// Wang Weiyan, wangweiyanster@gmail.com
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
// Erping Pang, erping@multicorewareinc.com
|
||||
//
|
||||
|
||||
|
||||
typedef struct __attribute__((aligned(4))) OptHaarFeature
|
||||
{
|
||||
int4 ofs[3] __attribute__((aligned (4)));
|
||||
@@ -20,6 +32,12 @@ typedef struct __attribute__((aligned(4))) Stump
|
||||
}
|
||||
Stump;
|
||||
|
||||
typedef struct __attribute__((aligned(4))) Node
|
||||
{
|
||||
int4 n __attribute__((aligned (4)));
|
||||
}
|
||||
Node;
|
||||
|
||||
typedef struct __attribute__((aligned (4))) Stage
|
||||
{
|
||||
int first __attribute__((aligned (4)));
|
||||
@@ -28,151 +46,614 @@ typedef struct __attribute__((aligned (4))) Stage
|
||||
}
|
||||
Stage;
|
||||
|
||||
__kernel void runHaarClassifierStump(
|
||||
typedef struct __attribute__((aligned (4))) ScaleData
|
||||
{
|
||||
float scale __attribute__((aligned (4)));
|
||||
int szi_width __attribute__((aligned (4)));
|
||||
int szi_height __attribute__((aligned (4)));
|
||||
int layer_ofs __attribute__((aligned (4)));
|
||||
int ystep __attribute__((aligned (4)));
|
||||
}
|
||||
ScaleData;
|
||||
|
||||
#ifndef SUM_BUF_SIZE
|
||||
#define SUM_BUF_SIZE 0
|
||||
#endif
|
||||
|
||||
#ifndef NODE_COUNT
|
||||
#define NODE_COUNT 1
|
||||
#endif
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
|
||||
void runHaarClassifier(
|
||||
int nscales, __global const ScaleData* scaleData,
|
||||
__global const int* sum,
|
||||
int sumstep, int sumoffset,
|
||||
__global const int* sqsum,
|
||||
int sqsumstep, int sqsumoffset,
|
||||
int _sumstep, int sumoffset,
|
||||
__global const OptHaarFeature* optfeatures,
|
||||
|
||||
int nstages,
|
||||
int splitstage, int nstages,
|
||||
__global const Stage* stages,
|
||||
__global const Stump* stumps,
|
||||
__global const Node* nodes,
|
||||
__global const float* leaves0,
|
||||
|
||||
volatile __global int* facepos,
|
||||
int2 imgsize, int xyscale, float factor,
|
||||
int4 normrect, int2 windowsize, int maxFaces)
|
||||
int4 normrect, int sqofs, int2 windowsize, int maxFaces)
|
||||
{
|
||||
int ix = get_global_id(0)*xyscale;
|
||||
int iy = get_global_id(1)*xyscale;
|
||||
sumstep /= sizeof(int);
|
||||
sqsumstep /= sizeof(int);
|
||||
int lx = get_local_id(0);
|
||||
int ly = get_local_id(1);
|
||||
int groupIdx = get_group_id(0);
|
||||
int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
|
||||
int scaleIdx, tileIdx, stageIdx;
|
||||
int sumstep = (int)(_sumstep/sizeof(int));
|
||||
int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
|
||||
mad24(normrect.y, sumstep, normrect.x + normrect.z),
|
||||
mad24(normrect.y + normrect.w, sumstep, normrect.x),
|
||||
mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
|
||||
int normarea = normrect.z * normrect.w;
|
||||
float invarea = 1.f/normarea;
|
||||
int lidx = ly*LOCAL_SIZE_X + lx;
|
||||
|
||||
if( ix < imgsize.x && iy < imgsize.y )
|
||||
#if SUM_BUF_SIZE > 0
|
||||
int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
|
||||
mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
|
||||
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
|
||||
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
|
||||
#else
|
||||
int4 nofs = nofs0;
|
||||
#endif
|
||||
#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
|
||||
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
|
||||
#if SUM_BUF_SIZE > 0
|
||||
__local int* ibuf = lstore;
|
||||
__local int* lcount = ibuf + SUM_BUF_SIZE;
|
||||
#else
|
||||
__local int* lcount = lstore;
|
||||
#endif
|
||||
__local float* lnf = (__local float*)(lcount + 1);
|
||||
__local float* lpartsum = lnf + LOCAL_SIZE;
|
||||
__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
|
||||
|
||||
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
|
||||
{
|
||||
int stageIdx;
|
||||
__global const Stump* stump = stumps;
|
||||
__global const ScaleData* s = scaleData + scaleIdx;
|
||||
int ystep = s->ystep;
|
||||
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
|
||||
int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
|
||||
(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
|
||||
int totalTiles = ntiles.x*ntiles.y;
|
||||
|
||||
__global const int* psum = sum + mad24(iy, sumstep, ix);
|
||||
__global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
|
||||
int normarea = normrect.z * normrect.w;
|
||||
float invarea = 1.f/normarea;
|
||||
float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
|
||||
pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
|
||||
float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
|
||||
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
||||
nf = nf > 0 ? nf : 1.f;
|
||||
|
||||
for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
|
||||
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
|
||||
{
|
||||
int i, ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++ )
|
||||
{
|
||||
float4 st = stump->st;
|
||||
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
|
||||
float4 weight = f->weight;
|
||||
int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
|
||||
int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
|
||||
int ix = lx, iy = ly;
|
||||
__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
|
||||
__global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
|
||||
|
||||
int4 ofs = f->ofs[0];
|
||||
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
if( weight.z > 0 )
|
||||
if( ix0 >= worksize.x || iy0 >= worksize.y )
|
||||
continue;
|
||||
#if SUM_BUF_SIZE > 0
|
||||
for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
|
||||
{
|
||||
int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
|
||||
vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
|
||||
if( lidx == 0 )
|
||||
lcount[0] = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
|
||||
{
|
||||
#if NODE_COUNT==1
|
||||
__global const Stump* stump = (__global const Stump*)nodes;
|
||||
#else
|
||||
__global const Node* node = nodes;
|
||||
__global const float* leaves = leaves0;
|
||||
#endif
|
||||
#if SUM_BUF_SIZE > 0
|
||||
__local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
|
||||
#else
|
||||
__global const int* psum = psum1;
|
||||
#endif
|
||||
|
||||
__global const float* psqsum = (__global const float*)(psum1 + sqofs);
|
||||
float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
|
||||
float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
|
||||
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
||||
nf = nf > 0 ? nf : 1.f;
|
||||
|
||||
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
int ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
#if NODE_COUNT==1
|
||||
for( i = 0; i < ntrees; i++ )
|
||||
{
|
||||
float4 st = stump[i].st;
|
||||
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
|
||||
float4 weight = f->weight;
|
||||
|
||||
int4 ofs = f->ofs[0];
|
||||
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
if( weight.z > 0 )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
s += (sval < st.y*nf) ? st.z : st.w;
|
||||
}
|
||||
stump += ntrees;
|
||||
#else
|
||||
for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
|
||||
{
|
||||
int idx = 0;
|
||||
do
|
||||
{
|
||||
int4 n = node[idx].n;
|
||||
__global const OptHaarFeature* f = optfeatures + n.x;
|
||||
float4 weight = f->weight;
|
||||
|
||||
int4 ofs = f->ofs[0];
|
||||
|
||||
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
if( weight.z > 0 )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
|
||||
}
|
||||
while(idx > 0);
|
||||
s += leaves[-idx];
|
||||
}
|
||||
#endif
|
||||
|
||||
if( s < stages[stageIdx].threshold )
|
||||
break;
|
||||
}
|
||||
|
||||
s += (sval < st.y*nf) ? st.z : st.w;
|
||||
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
|
||||
{
|
||||
int count = atomic_inc(lcount);
|
||||
lbuf[count] = (int)(ix | (iy << 8));
|
||||
lnf[count] = nf;
|
||||
}
|
||||
}
|
||||
|
||||
if( s < stages[stageIdx].threshold )
|
||||
break;
|
||||
}
|
||||
|
||||
if( stageIdx == nstages )
|
||||
{
|
||||
int nfaces = atomic_inc(facepos);
|
||||
if( nfaces < maxFaces )
|
||||
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
|
||||
{
|
||||
volatile __global int* face = facepos + 1 + nfaces*4;
|
||||
face[0] = convert_int_rte(ix*factor);
|
||||
face[1] = convert_int_rte(iy*factor);
|
||||
face[2] = convert_int_rte(windowsize.x*factor);
|
||||
face[3] = convert_int_rte(windowsize.y*factor);
|
||||
int nrects = lcount[0];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if( nrects == 0 )
|
||||
break;
|
||||
if( lidx == 0 )
|
||||
lcount[0] = 0;
|
||||
|
||||
{
|
||||
#if NODE_COUNT == 1
|
||||
__global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
|
||||
#else
|
||||
__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
|
||||
__global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
|
||||
#endif
|
||||
int nparts = LOCAL_SIZE / nrects;
|
||||
int ntrees = stages[stageIdx].ntrees;
|
||||
int ntrees_p = (ntrees + nparts - 1)/nparts;
|
||||
int nr = lidx / nparts;
|
||||
int partidx = -1, idxval = 0;
|
||||
float partsum = 0.f, nf = 0.f;
|
||||
|
||||
if( nr < nrects )
|
||||
{
|
||||
partidx = lidx % nparts;
|
||||
idxval = lbuf[nr];
|
||||
nf = lnf[nr];
|
||||
|
||||
{
|
||||
int ntrees0 = ntrees_p*partidx;
|
||||
int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
|
||||
int ix1 = idxval & 255, iy1 = idxval >> 8;
|
||||
#if SUM_BUF_SIZE > 0
|
||||
__local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
|
||||
#else
|
||||
__global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
|
||||
#endif
|
||||
|
||||
#if NODE_COUNT == 1
|
||||
for( i = ntrees0; i < ntrees1; i++ )
|
||||
{
|
||||
float4 st = stump[i].st;
|
||||
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
|
||||
float4 weight = f->weight;
|
||||
|
||||
int4 ofs = f->ofs[0];
|
||||
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
//if( weight.z > 0 )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
partsum += (sval < st.y*nf) ? st.z : st.w;
|
||||
}
|
||||
#else
|
||||
for( i = ntrees0; i < ntrees1; i++ )
|
||||
{
|
||||
int idx = 0;
|
||||
do
|
||||
{
|
||||
int4 n = node[i*2 + idx].n;
|
||||
__global const OptHaarFeature* f = optfeatures + n.x;
|
||||
float4 weight = f->weight;
|
||||
int4 ofs = f->ofs[0];
|
||||
|
||||
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
if( weight.z > 0 )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
|
||||
}
|
||||
while(idx > 0);
|
||||
partsum += leaves[i*3-idx];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
lpartsum[lidx] = partsum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if( partidx == 0 )
|
||||
{
|
||||
float s = lpartsum[nr*nparts];
|
||||
for( i = 1; i < nparts; i++ )
|
||||
s += lpartsum[i + nr*nparts];
|
||||
if( s >= stages[stageIdx].threshold )
|
||||
{
|
||||
int count = atomic_inc(lcount);
|
||||
lbuf[count] = idxval;
|
||||
lnf[count] = nf;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if( stageIdx == nstages )
|
||||
{
|
||||
int nrects = lcount[0];
|
||||
if( lidx < nrects )
|
||||
{
|
||||
int nfaces = atomic_inc(facepos);
|
||||
if( nfaces < maxFaces )
|
||||
{
|
||||
volatile __global int* face = facepos + 1 + nfaces*3;
|
||||
int val = lbuf[lidx];
|
||||
face[0] = scaleIdx;
|
||||
face[1] = ix0 + (val & 255);
|
||||
face[2] = iy0 + (val >> 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef CALC_SUM_OFS_
|
||||
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
|
||||
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
|
||||
|
||||
__kernel void runLBPClassifierStump(
|
||||
__kernel void runLBPClassifierStumpSimple(
|
||||
int nscales, __global const ScaleData* scaleData,
|
||||
__global const int* sum,
|
||||
int sumstep, int sumoffset,
|
||||
int _sumstep, int sumoffset,
|
||||
__global const OptLBPFeature* optfeatures,
|
||||
|
||||
int nstages,
|
||||
int splitstage, int nstages,
|
||||
__global const Stage* stages,
|
||||
__global const Stump* stumps,
|
||||
__global const int* bitsets,
|
||||
int bitsetSize,
|
||||
|
||||
volatile __global int* facepos,
|
||||
int2 imgsize, int xyscale, float factor,
|
||||
int2 windowsize, int maxFaces)
|
||||
{
|
||||
int ix = get_global_id(0)*xyscale;
|
||||
int iy = get_global_id(1)*xyscale;
|
||||
sumstep /= sizeof(int);
|
||||
int lx = get_local_id(0);
|
||||
int ly = get_local_id(1);
|
||||
int local_size_x = get_local_size(0);
|
||||
int local_size_y = get_local_size(1);
|
||||
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
|
||||
int ngroups = get_num_groups(0)*get_num_groups(1);
|
||||
int scaleIdx, tileIdx, stageIdx;
|
||||
int startStage = 0, endStage = nstages;
|
||||
int sumstep = (int)(_sumstep/sizeof(int));
|
||||
|
||||
if( ix < imgsize.x && iy < imgsize.y )
|
||||
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
|
||||
{
|
||||
int stageIdx;
|
||||
__global const Stump* stump = stumps;
|
||||
__global const int* p = sum + mad24(iy, sumstep, ix);
|
||||
__global const ScaleData* s = scaleData + scaleIdx;
|
||||
int ystep = s->ystep;
|
||||
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
|
||||
int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
|
||||
(worksize.y/ystep + local_size_y-1)/local_size_y);
|
||||
int totalTiles = ntiles.x*ntiles.y;
|
||||
|
||||
for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
|
||||
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
|
||||
{
|
||||
int i, ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize )
|
||||
int iy = ((tileIdx / ntiles.x)*local_size_y + ly)*ystep;
|
||||
int ix = ((tileIdx % ntiles.x)*local_size_x + lx)*ystep;
|
||||
|
||||
if( ix < worksize.x && iy < worksize.y )
|
||||
{
|
||||
float4 st = stump->st;
|
||||
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
|
||||
int16 ofs = f->ofs;
|
||||
__global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
|
||||
__global const Stump* stump = stumps;
|
||||
__global const int* bitset = bitsets;
|
||||
|
||||
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
|
||||
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
|
||||
for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
|
||||
{
|
||||
int i, ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
|
||||
{
|
||||
float4 st = stump->st;
|
||||
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
|
||||
int16 ofs = f->ofs;
|
||||
|
||||
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
|
||||
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
|
||||
|
||||
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
|
||||
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
|
||||
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
|
||||
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
|
||||
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
|
||||
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
|
||||
|
||||
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
|
||||
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
|
||||
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
|
||||
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
|
||||
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
|
||||
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
|
||||
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
|
||||
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
|
||||
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
|
||||
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
|
||||
|
||||
s += (bitsets[idx] & (1 << mask)) ? st.z : st.w;
|
||||
}
|
||||
s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
|
||||
}
|
||||
|
||||
if( s < stages[stageIdx].threshold )
|
||||
break;
|
||||
}
|
||||
if( s < stages[stageIdx].threshold )
|
||||
break;
|
||||
}
|
||||
|
||||
if( stageIdx == nstages )
|
||||
{
|
||||
int nfaces = atomic_inc(facepos);
|
||||
if( nfaces < maxFaces )
|
||||
{
|
||||
volatile __global int* face = facepos + 1 + nfaces*4;
|
||||
face[0] = convert_int_rte(ix*factor);
|
||||
face[1] = convert_int_rte(iy*factor);
|
||||
face[2] = convert_int_rte(windowsize.x*factor);
|
||||
face[3] = convert_int_rte(windowsize.y*factor);
|
||||
if( stageIdx == nstages )
|
||||
{
|
||||
int nfaces = atomic_inc(facepos);
|
||||
if( nfaces < maxFaces )
|
||||
{
|
||||
volatile __global int* face = facepos + 1 + nfaces*3;
|
||||
face[0] = scaleIdx;
|
||||
face[1] = ix;
|
||||
face[2] = iy;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * runLBPClassifierStump
 *
 * Evaluates a stump-based LBP cascade over every scale layer of an integral
 * image and appends the windows that pass all stages to `facepos`.
 *
 * Work distribution: each scale layer is cut into LOCAL_SIZE_X x LOCAL_SIZE_Y
 * tiles; work-group g (along dimension 0) processes tiles g, g + ngroups, ...
 *
 * Per tile the cascade runs in two phases:
 *   1. stages [0, splitstage): one work-item per window, each evaluating all
 *      stumps of a stage on its own. Survivors are compacted into `lbuf`.
 *   2. stages [splitstage, nstages): the whole work-group cooperates on the
 *      surviving windows. The stumps of a stage are split into `nparts`
 *      slices per window, the partial sums are reduced through `lpartsum`,
 *      and the windows that pass are compacted again.
 *
 * Parameters:
 *   nscales, scaleData  number of scale layers and their descriptors
 *                       (szi_width, szi_height, layer_ofs, ystep are read).
 *   sum                 integral image buffer, addressed in int units.
 *   _sumstep            row stride of `sum` in bytes (divided by sizeof(int)).
 *   sumoffset           not used by this kernel.
 *   optfeatures         LBP features; `ofs` holds 16 offsets that are used as
 *                       the corners of a 3x3 grid of cells.
 *   splitstage, nstages boundary between the two phases / total stage count.
 *   stages              per-stage `first`, `ntrees` and `threshold`.
 *   stumps              per-stump float4 `st`: the bits of st.x are the
 *                       feature index, st.z / st.w are the values added when
 *                       the selected subset bit is set / clear.
 *   bitsets, bitsetSize per-stump subset bitmaps, `bitsetSize` ints apart.
 *                       The code indexes words 0..7 of each bitmap, so this
 *                       assumes bitsetSize >= 8 - TODO confirm against host.
 *   facepos             facepos[0] is an atomic hit counter; hit k is stored
 *                       as 3 ints (scaleIdx, x, y) starting at facepos[1+3*k].
 *   windowsize          detection window size, used to derive the work area.
 *   maxFaces            hits whose index is >= maxFaces are counted but not
 *                       stored.
 *
 * Changes relative to the previous revision of this kernel:
 *   - a barrier now precedes the read of lcount[0] at the top of the
 *     phase-2 stage loop, so every work-item sees the final survivor count;
 *   - the barrier before resetting lcount[0] for a new tile is executed
 *     regardless of SUM_BUF_SIZE, so the reset cannot overtake work-items
 *     that are still reading the previous tile's count.
 */
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
void runLBPClassifierStump(
    int nscales, __global const ScaleData* scaleData,
    __global const int* sum,
    int _sumstep, int sumoffset,
    __global const OptLBPFeature* optfeatures,

    int splitstage, int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,
    __global const int* bitsets,
    int bitsetSize,

    volatile __global int* facepos,
    int2 windowsize, int maxFaces)
{
    int lx = get_local_id(0);
    int ly = get_local_id(1);
    int groupIdx = get_group_id(0);
    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
    int scaleIdx, tileIdx, stageIdx;
    int sumstep = (int)(_sumstep/sizeof(int));
    int lidx = ly*LOCAL_SIZE_X + lx;   // linear index of the work-item in its group

    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)

    // Single local allocation, carved up as:
    //   [ibuf: SUM_BUF_SIZE ints][lcount: 1 int]
    //   [lpartsum: LOCAL_SIZE floats][lbuf: LOCAL_SIZE shorts]
    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
#if SUM_BUF_SIZE > 0
    __local int* ibuf = lstore;                 // local copy of the tile's integral data
    __local int* lcount = ibuf + SUM_BUF_SIZE;  // number of surviving windows
#else
    __local int* lcount = lstore;               // number of surviving windows
#endif
    __local float* lpartsum = (__local float*)(lcount + 1);        // per-work-item partial stage sums
    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE); // packed survivor coordinates

    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    {
        __global const ScaleData* sd = scaleData + scaleIdx;
        int ystep = sd->ystep;
        // Range of valid top-left window positions in this layer.
        int2 worksize = (int2)(max(sd->szi_width - windowsize.x, 0),
                               max(sd->szi_height - windowsize.y, 0));
        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
        int totalTiles = ntiles.x*ntiles.y;

        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
        {
            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
            int ix = lx, iy = ly;
            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + sd->layer_ofs;

            // ix0/iy0 depend only on the group, so the whole work-group skips
            // together and no barrier below is bypassed by a subset of it.
            if( ix0 >= worksize.x || iy0 >= worksize.y )
                continue;

#if SUM_BUF_SIZE > 0
            // Cooperative copy of the tile's integral data into local memory,
            // four ints per work-item per step.
            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
            {
                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
            }
#endif
            // Executed even when SUM_BUF_SIZE == 0: besides publishing ibuf,
            // it keeps the reset of lcount[0] below from racing with work-items
            // that still read the previous tile's count at the end of the loop
            // body.
            barrier(CLK_LOCAL_MEM_FENCE);

            if( lidx == 0 )
                lcount[0] = 0;
            barrier(CLK_LOCAL_MEM_FENCE);

            // ---- Phase 1: one work-item per window, stages [0, splitstage) ----
            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
            {
                __global const Stump* stump = stumps;
                __global const int* bitset = bitsets;
#if SUM_BUF_SIZE > 0
                __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
#else
                __global const int* p = psum0 + mad24(iy, sumstep, ix);
#endif

                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
                {
                    int ntrees = stages[stageIdx].ntrees;
                    float stageSum = 0.f;
                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
                    {
                        float4 st = stump->st;
                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
                        int16 ofs = f->ofs;

                        // Sum of the central cell; every neighbour is compared to it.
                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );

                        // Top row -> 3-bit word index into the stump's bitmap.
                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2

                        // Remaining five neighbours -> bit position inside that word.
                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 3

                        stageSum += (bitset[idx] & (1 << mask)) ? st.z : st.w;
                    }

                    if( stageSum < stages[stageIdx].threshold )
                        break;
                }

                // Survivors are queued for phase 2. When ystep != 1 only windows
                // with even ix and even iy are kept.
                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
                {
                    int count = atomic_inc(lcount);
                    // Packed as ix | (iy << 8); decoded below with & 255 and >> 8.
                    lbuf[count] = (int)(ix | (iy << 8));
                }
            }

            // ---- Phase 2: whole group cooperates, stages [splitstage, nstages) ----
            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
            {
                // All atomic_inc(lcount)/lbuf writes of phase 1 or of the previous
                // stage must be complete before the count is read; otherwise
                // work-items could see different nrects and diverge around the
                // barriers below.
                barrier(CLK_LOCAL_MEM_FENCE);
                int nrects = lcount[0];

                // Everyone must have read the count before it is reset.
                barrier(CLK_LOCAL_MEM_FENCE);
                if( nrects == 0 )
                    break;
                if( lidx == 0 )
                    lcount[0] = 0;

                {
                    __global const Stump* stump = stumps + stages[stageIdx].first;
                    __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
                    int nparts = LOCAL_SIZE / nrects;           // work-items per surviving window
                    int ntrees = stages[stageIdx].ntrees;
                    int ntrees_p = (ntrees + nparts - 1)/nparts; // stumps per work-item (rounded up)
                    int nr = lidx / nparts;                     // window this work-item helps with
                    int partidx = -1, idxval = 0;
                    float partsum = 0.f;

                    if( nr < nrects )
                    {
                        partidx = lidx % nparts;
                        idxval = lbuf[nr];

                        {
                            int ntrees0 = ntrees_p*partidx;
                            int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
                            int ix1 = idxval & 255, iy1 = idxval >> 8;
#if SUM_BUF_SIZE > 0
                            __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
#else
                            __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
#endif

                            for( i = ntrees0; i < ntrees1; i++ )
                            {
                                float4 st = stump[i].st;
                                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
                                int16 ofs = f->ofs;

                                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
                                ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])

                                // Same LBP code computation as in phase 1.
                                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );

                                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
                                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
                                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2

                                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
                                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
                                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
                                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
                                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 3

                                partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
                            }
                        }
                    }
                    lpartsum[lidx] = partsum;
                    barrier(CLK_LOCAL_MEM_FENCE);

                    // The first work-item of each window reduces the partial sums
                    // and re-queues the window if it passes this stage.
                    if( partidx == 0 )
                    {
                        float stageSum = lpartsum[nr*nparts];
                        for( i = 1; i < nparts; i++ )
                            stageSum += lpartsum[i + nr*nparts];
                        if( stageSum >= stages[stageIdx].threshold )
                        {
                            int count = atomic_inc(lcount);
                            lbuf[count] = idxval;
                        }
                    }
                }
            }

            barrier(CLK_LOCAL_MEM_FENCE);
            // stageIdx == nstages only if the stage loop ran to completion
            // (it stops early once no window is left).
            if( stageIdx == nstages )
            {
                int nrects = lcount[0];
                if( lidx < nrects )
                {
                    int nfaces = atomic_inc(facepos);
                    if( nfaces < maxFaces )
                    {
                        volatile __global int* face = facepos + 1 + nfaces*3;
                        int val = lbuf[lidx];
                        face[0] = scaleIdx;
                        face[1] = ix0 + (val & 255);
                        face[2] = iy0 + (val >> 8);
                    }
                }
            }
        }
    }
}
|
||||
|
Reference in New Issue
Block a user