From efa84d82251db6c8ef4c38daac683e7d32ff205a Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 16 Dec 2014 16:21:05 +0800 Subject: [PATCH] Use preprocessor for constant values in OpenCL kernel instead of the parameter variable. It can improve the performance of OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/*. In particular, OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/15 and OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/16 can be improved by about 2% on Intel platforms. Signed-off-by: Yan Wang --- modules/objdetect/src/cascadedetect.cpp | 24 ++++++------ modules/objdetect/src/opencl/cascadedetect.cl | 39 ++++++++----------- 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp index 07c848eb9..4e25a5ccf 100644 --- a/modules/objdetect/src/cascadedetect.cpp +++ b/modules/objdetect/src/cascadedetect.cpp @@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vectorgetNormRect(); int sqofs = haar->getSquaresOffset(); - int splitstage_ocl = 1; haarKernel.args((int)scales.size(), ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData @@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector 0 ? 
nf : 1.f; - for( stageIdx = 0; stageIdx < splitstage; stageIdx++ ) + for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) { int ntrees = stages[stageIdx].ntrees; float s = 0.f; @@ -221,7 +219,7 @@ void runHaarClassifier( break; } - if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) ) + if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) { int count = atomic_inc(lcount); lbuf[count] = (int)(ix | (iy << 8)); @@ -229,7 +227,7 @@ void runHaarClassifier( } } - for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ ) + for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) { int nrects = lcount[0]; @@ -335,13 +333,13 @@ void runHaarClassifier( } barrier(CLK_LOCAL_MEM_FENCE); - if( stageIdx == nstages ) + if( stageIdx == N_STAGES ) { int nrects = lcount[0]; if( lidx < nrects ) { int nfaces = atomic_inc(facepos); - if( nfaces < maxFaces ) + if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; int val = lbuf[lidx]; @@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple( __global const int* sum, int _sumstep, int sumoffset, __global const OptLBPFeature* optfeatures, - - int splitstage, int nstages, __global const Stage* stages, __global const Stump* stumps, __global const int* bitsets, int bitsetSize, volatile __global int* facepos, - int2 windowsize, int maxFaces) + int2 windowsize) { int lx = get_local_id(0); int ly = get_local_id(1); @@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple( int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0); int ngroups = get_num_groups(0)*get_num_groups(1); int scaleIdx, tileIdx, stageIdx; - int startStage = 0, endStage = nstages; int sumstep = (int)(_sumstep/sizeof(int)); for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) @@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple( __global const Stump* stump = stumps; __global const int* bitset = bitsets; - for( stageIdx = 0; stageIdx < endStage; stageIdx++ ) + for( stageIdx 
= 0; stageIdx < N_STAGES; stageIdx++ ) { int i, ntrees = stages[stageIdx].ntrees; float s = 0.f; @@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple( break; } - if( stageIdx == nstages ) + if( stageIdx == N_STAGES ) { int nfaces = atomic_inc(facepos); - if( nfaces < maxFaces ) + if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; face[0] = scaleIdx; @@ -455,15 +450,13 @@ void runLBPClassifierStump( __global const int* sum, int _sumstep, int sumoffset, __global const OptLBPFeature* optfeatures, - - int splitstage, int nstages, __global const Stage* stages, __global const Stump* stumps, __global const int* bitsets, int bitsetSize, volatile __global int* facepos, - int2 windowsize, int maxFaces) + int2 windowsize) { int lx = get_local_id(0); int ly = get_local_id(1); @@ -525,7 +518,7 @@ void runLBPClassifierStump( __global const int* p = psum0 + mad24(iy, sumstep, ix); #endif - for( stageIdx = 0; stageIdx < splitstage; stageIdx++ ) + for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) { int ntrees = stages[stageIdx].ntrees; float s = 0.f; @@ -554,14 +547,14 @@ void runLBPClassifierStump( break; } - if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) ) + if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) { int count = atomic_inc(lcount); lbuf[count] = (int)(ix | (iy << 8)); } } - for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ ) + for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) { int nrects = lcount[0]; @@ -639,13 +632,13 @@ void runLBPClassifierStump( } barrier(CLK_LOCAL_MEM_FENCE); - if( stageIdx == nstages ) + if( stageIdx == N_STAGES ) { int nrects = lcount[0]; if( lidx < nrects ) { int nfaces = atomic_inc(facepos); - if( nfaces < maxFaces ) + if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; int val = lbuf[lidx];