started adding OpenCL acceleration of LBP-based object detectors
This commit is contained in:
@@ -1,19 +1,22 @@
|
||||
///////////////////////////// OpenCL kernels for face detection //////////////////////////////
|
||||
////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
|
||||
|
||||
// Per-feature record read by the Haar classifier kernel.
// NOTE(review): this view shows two typedef names for one body (OptFeature and
// OptHaarFeature); this looks like the before/after sides of a rename, with
// the kernel signature moving from OptFeature to OptHaarFeature -- confirm
// against the actual file.
typedef struct __attribute__((aligned(4))) OptFeature
|
||||
typedef struct __attribute__((aligned(4))) OptHaarFeature
|
||||
{
|
||||
// Each ofs[k] is used by the kernel as four indices into psum, combined as
// psum[x] - psum[y] - psum[z] + psum[w].
int4 ofs[3] __attribute__((aligned (4)));
|
||||
// The k-th combination above is scaled by weight.x / weight.y / weight.z;
// weight.w is not read in the visible kernel lines.
float4 weight __attribute__((aligned (4)));
|
||||
}
|
||||
OptFeature;
|
||||
OptHaarFeature;
|
||||
|
||||
// Per-feature record for the LBP classifier kernel: a single 16-component
// integer vector, loaded whole by the kernel as `int16 ofs = f->ofs`.
// NOTE(review): presumably these are 16 offsets relative to the window's
// position in the integral image -- the visible kernel lines do not yet use
// the components, so confirm against the host-side packing code.
typedef struct __attribute__((aligned(4))) OptLBPFeature
|
||||
{
|
||||
int16 ofs __attribute__((aligned (4)));
|
||||
}
|
||||
OptLBPFeature;
|
||||
|
||||
// One stump (single-split weak classifier) evaluated per tree iteration of a stage.
// NOTE(review): this view lists both the scalar fields (featureIdx, threshold,
// left, right) and the packed `st`; some kernel lines read the scalars via
// stump->featureIdx / ->threshold / ->left / ->right, others read only
// stump->st. This looks like old and new layouts shown together -- confirm
// which fields the struct really has.
typedef struct __attribute__((aligned(4))) Stump
|
||||
{
|
||||
int featureIdx __attribute__((aligned (4)));
|
||||
float threshold __attribute__((aligned (4))); // for ordered features only
|
||||
float left __attribute__((aligned (4)));
|
||||
float right __attribute__((aligned (4)));
|
||||
// Packed form as consumed by the kernels: as_int(st.x) is the feature index,
// and the stage sum receives st.z when sval < st.y*nf, otherwise st.w.
float4 st __attribute__((aligned (4)));
|
||||
}
|
||||
Stump;
|
||||
|
||||
@@ -30,7 +33,7 @@ __kernel void runHaarClassifierStump(
|
||||
int sumstep, int sumoffset,
|
||||
__global const int* sqsum,
|
||||
int sqsumstep, int sqsumoffset,
|
||||
__global const OptFeature* optfeatures,
|
||||
__global const OptHaarFeature* optfeatures,
|
||||
|
||||
int nstages,
|
||||
__global const Stage* stages,
|
||||
@@ -47,11 +50,8 @@ __kernel void runHaarClassifierStump(
|
||||
|
||||
if( ix < imgsize.x && iy < imgsize.y )
|
||||
{
|
||||
int ntrees;
|
||||
int stageIdx, i;
|
||||
float s = 0.f;
|
||||
int stageIdx;
|
||||
__global const Stump* stump = stumps;
|
||||
__global const OptFeature* f;
|
||||
|
||||
__global const int* psum = sum + mad24(iy, sumstep, ix);
|
||||
__global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
|
||||
@@ -61,20 +61,19 @@ __kernel void runHaarClassifierStump(
|
||||
pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
|
||||
float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
|
||||
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
||||
float4 weight, vsval;
|
||||
int4 ofs, ofs0, ofs1, ofs2;
|
||||
nf = nf > 0 ? nf : 1.f;
|
||||
|
||||
for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
|
||||
{
|
||||
ntrees = stages[stageIdx].ntrees;
|
||||
s = 0.f;
|
||||
int i, ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++ )
|
||||
{
|
||||
f = optfeatures + stump->featureIdx;
|
||||
weight = f->weight;
|
||||
float4 st = stump->st;
|
||||
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
|
||||
float4 weight = f->weight;
|
||||
|
||||
ofs = f->ofs[0];
|
||||
int4 ofs = f->ofs[0];
|
||||
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
@@ -84,7 +83,7 @@ __kernel void runHaarClassifierStump(
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
s += (sval < stump->threshold*nf) ? stump->left : stump->right;
|
||||
s += (sval < st.y*nf) ? st.z : st.w;
|
||||
}
|
||||
|
||||
if( s < stages[stageIdx].threshold )
|
||||
@@ -110,9 +109,7 @@ __kernel void runHaarClassifierStump(
|
||||
__kernel void runLBPClassifierStump(
|
||||
__global const int* sum,
|
||||
int sumstep, int sumoffset,
|
||||
__global const int* sqsum,
|
||||
int sqsumstep, int sqsumoffset,
|
||||
__global const OptFeature* optfeatures,
|
||||
__global const OptLBPFeature* optfeatures,
|
||||
|
||||
int nstages,
|
||||
__global const Stage* stages,
|
||||
@@ -124,50 +121,45 @@ __kernel void runLBPClassifierStump(
|
||||
int2 imgsize, int xyscale, float factor,
|
||||
int4 normrect, int2 windowsize, int maxFaces)
|
||||
{
|
||||
int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
|
||||
int ix = get_global_id(0)*xyscale;
|
||||
int iy = get_global_id(1)*xyscale;
|
||||
sumstep /= sizeof(int);
|
||||
sqsumstep /= sizeof(int);
|
||||
|
||||
|
||||
if( ix < imgsize.x && iy < imgsize.y )
|
||||
{
|
||||
int ntrees;
|
||||
int stageIdx, i;
|
||||
float s = 0.f;
|
||||
int stageIdx;
|
||||
__global const Stump* stump = stumps;
|
||||
__global const int* bitset = bitsets;
|
||||
__global const OptFeature* f;
|
||||
|
||||
__global const int* psum = sum + mad24(iy, sumstep, ix);
|
||||
__global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
|
||||
int normarea = normrect.z * normrect.w;
|
||||
float invarea = 1.f/normarea;
|
||||
float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
|
||||
pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
|
||||
float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
|
||||
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
||||
float4 weight;
|
||||
int4 ofs;
|
||||
nf = nf > 0 ? nf : 1.f;
|
||||
|
||||
|
||||
for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
|
||||
{
|
||||
ntrees = stages[stageIdx].ntrees;
|
||||
s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
|
||||
int i, ntrees = stages[stageIdx].ntrees;
|
||||
float s = 0.f;
|
||||
for( i = 0; i < ntrees; i++, stump++ )
|
||||
{
|
||||
f = optfeatures + stump->featureIdx;
|
||||
|
||||
weight = f->weight;
|
||||
|
||||
// compute LBP feature to val
|
||||
s += (bitset[val >> 5] & (1 << (val & 31))) ? stump->left : stump->right;
|
||||
float4 st = stump->st;
|
||||
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
|
||||
int16 ofs = f->ofs;
|
||||
|
||||
|
||||
|
||||
int4 ofs = f->ofs[0];
|
||||
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
|
||||
ofs = f->ofs[1];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
|
||||
if( weight.z > 0 )
|
||||
{
|
||||
ofs = f->ofs[2];
|
||||
sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
|
||||
}
|
||||
|
||||
s += (sval < st.y*nf) ? st.z : st.w;
|
||||
}
|
||||
|
||||
|
||||
if( s < stages[stageIdx].threshold )
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if( stageIdx == nstages )
|
||||
{
|
||||
int nfaces = atomic_inc(facepos);
|
||||
|
Reference in New Issue
Block a user