Made small optimization: for some processors using float accumulator gives speedup.

This commit is contained in:
Leonid Beynenson 2011-05-31 13:44:13 +00:00
parent 6aabf72bc5
commit 1299941622

View File

@ -982,24 +982,47 @@ inline int predictCategoricalStump( CascadeClassifier& cascade, Ptr<FeatureEvalu
CascadeClassifier::Data::DTreeNode* cascadeNodes = &cascade.data.nodes[0]; CascadeClassifier::Data::DTreeNode* cascadeNodes = &cascade.data.nodes[0];
CascadeClassifier::Data::Stage* cascadeStages = &cascade.data.stages[0]; CascadeClassifier::Data::Stage* cascadeStages = &cascade.data.stages[0];
#ifdef HAVE_TEGRA_OPTIMIZATION
float tmp; // float accumulator -- float operations are quicker
#endif
for( int si = 0; si < nstages; si++ ) for( int si = 0; si < nstages; si++ )
{ {
CascadeClassifier::Data::Stage& stage = cascadeStages[si]; CascadeClassifier::Data::Stage& stage = cascadeStages[si];
int wi, ntrees = stage.ntrees; int wi, ntrees = stage.ntrees;
#ifdef HAVE_TEGRA_OPTIMIZATION
tmp = 0;
#else
sum = 0; sum = 0;
#endif
for( wi = 0; wi < ntrees; wi++ ) for( wi = 0; wi < ntrees; wi++ )
{ {
CascadeClassifier::Data::DTreeNode& node = cascadeNodes[nodeOfs]; CascadeClassifier::Data::DTreeNode& node = cascadeNodes[nodeOfs];
int c = featureEvaluator(node.featureIdx); int c = featureEvaluator(node.featureIdx);
const int* subset = &cascadeSubsets[nodeOfs*subsetSize]; const int* subset = &cascadeSubsets[nodeOfs*subsetSize];
#ifdef HAVE_TEGRA_OPTIMIZATION
tmp += cascadeLeaves[ subset[c>>5] & (1 << (c & 31)) ? leafOfs : leafOfs+1];
#else
sum += cascadeLeaves[ subset[c>>5] & (1 << (c & 31)) ? leafOfs : leafOfs+1]; sum += cascadeLeaves[ subset[c>>5] & (1 << (c & 31)) ? leafOfs : leafOfs+1];
#endif
nodeOfs++; nodeOfs++;
leafOfs += 2; leafOfs += 2;
} }
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tmp < stage.threshold ) {
sum = (double)tmp;
return -si;
}
#else
if( sum < stage.threshold ) if( sum < stage.threshold )
return -si; return -si;
#endif
} }
#ifdef HAVE_TEGRA_OPTIMIZATION
sum = (double)tmp;
#endif
return 1; return 1;
} }