From 3dcddad88aa13b729313939648c29f420a9f8054 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 5 Dec 2013 13:52:26 +0400 Subject: [PATCH] ocl: added workaround into Haar kernels --- modules/ocl/src/opencl/haarobjectdetect.cl | 86 ++++++++------- .../src/opencl/haarobjectdetect_scaled2.cl | 101 +++++++++--------- 2 files changed, 100 insertions(+), 87 deletions(-) diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index a62b3af8c..980e85dd2 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -62,13 +62,13 @@ typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode GpuHidHaarTreeNode; -typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier -{ - int count __attribute__((aligned (4))); - GpuHidHaarTreeNode* node __attribute__((aligned (8))); - float* alpha __attribute__((aligned (8))); -} -GpuHidHaarClassifier; +//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier +//{ +// int count __attribute__((aligned (4))); +// GpuHidHaarTreeNode* node __attribute__((aligned (8))); +// float* alpha __attribute__((aligned (8))); +//} +//GpuHidHaarClassifier; typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier @@ -84,22 +84,22 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier GpuHidHaarStageClassifier; -typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade -{ - int count __attribute__((aligned (4))); - int is_stump_based __attribute__((aligned (4))); - int has_tilted_features __attribute__((aligned (4))); - int is_tree __attribute__((aligned (4))); - int pq0 __attribute__((aligned (4))); - int pq1 __attribute__((aligned (4))); - int pq2 __attribute__((aligned (4))); - int pq3 __attribute__((aligned (4))); - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float inv_window_area __attribute__((aligned (4))); -} GpuHidHaarClassifierCascade; +//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade +//{ +// int count __attribute__((aligned (4))); +// int is_stump_based __attribute__((aligned (4))); +// int has_tilted_features __attribute__((aligned (4))); +// int is_tree __attribute__((aligned (4))); +// int pq0 __attribute__((aligned (4))); +// int pq1 __attribute__((aligned (4))); +// int pq2 __attribute__((aligned (4))); +// int pq3 __attribute__((aligned (4))); +// int p0 __attribute__((aligned (4))); +// int p1 __attribute__((aligned (4))); +// int p2 __attribute__((aligned (4))); +// int p3 __attribute__((aligned (4))); +// float inv_window_area __attribute__((aligned (4))); +//} GpuHidHaarClassifierCascade; #ifdef PACKED_CLASSIFIER @@ -196,10 +196,12 @@ __kernel void gpuRunHaarClassifierCascadePacked( for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) {// iterate until candidate is exist float stage_sum = 0.0f; - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); - for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ ) + for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int #define M0(_t) ((_t)&0xFFFF) @@ -355,14 +357,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa variance_norm_factor = variance_norm_factor * correction - mean * mean; variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) { float stage_sum = 0.f; - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - for(int nodeloop = 0; nodeloop < stageinfo.x; ) + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; + for(int nodeloop = 0; nodeloop < stagecount; ) { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); + __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) + (((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode)); int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -418,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa #endif } - result = (stage_sum >= stagethreshold); + result = (stage_sum >= stagethreshold) ? 1 : 0; } if(factor < 2) { @@ -447,14 +452,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); + //int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; int perfscale = queuecount > 4 ? 3 : 2; int queuecount_loop = (queuecount + (1<> perfscale; int lcl_compute_win = lcl_sz >> perfscale; int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); + int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale); int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); for(int queueloop=0; queueloopp[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -549,7 +557,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa queuecount = lclcount[0]; barrier(CLK_LOCAL_MEM_FENCE); - nodecounter += stageinfo.x; + nodecounter += stagecount; }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++) if(lcl_id> 16; int totalgrp = scaleinfo1.y & 0xffff; float factor = as_float(scaleinfo1.w); @@ -174,15 +173,18 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) { float stage_sum = 0.f; - int stagecount = stagecascadeptr[stageloop].count; + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + (((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; for (int nodeloop = 0; nodeloop < stagecount;) { - __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) + (((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode)); int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0])); + float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); float nodethreshold = w.w * variance_norm_factor; info1.x += p_offset; @@ -204,7 +206,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; - bool passThres = classsum >= nodethreshold; + bool passThres = (classsum >= nodethreshold) ? 1 : 0; #if STUMP_BASED stage_sum += passThres ? alpha3.y : alpha3.x; @@ -234,7 +236,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } #endif } - result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold); + + result = (stage_sum >= stageinfo->threshold) ? 1 : 0; } barrier(CLK_LOCAL_MEM_FENCE); @@ -281,11 +284,14 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } } } -__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) +__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum) { - int counter = get_global_id(0); + const int counter = get_global_id(0); int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(orinode + counter); + GpuHidHaarTreeNode t1 = *(__global GpuHidHaarTreeNode*) + (((__global uchar*)orinode) + counter * sizeof(GpuHidHaarTreeNode)); + __global GpuHidHaarTreeNode* pNew = (__global GpuHidHaarTreeNode*) + (((__global uchar*)newnode) + (counter + nodenum) * sizeof(GpuHidHaarTreeNode)); #pragma unroll for (i = 0; i < 3; i++) @@ -297,22 +303,21 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH } t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]); - counter += nodenum; #pragma unroll for (i = 0; i < 3; i++) { - newnode[counter].p[i][0] = tr_x[i]; - newnode[counter].p[i][1] = tr_y[i]; - newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; - newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; - newnode[counter].weight[i] = t1.weight[i] * weight_scale; + pNew->p[i][0] = tr_x[i]; + pNew->p[i][1] = tr_y[i]; + pNew->p[i][2] = tr_x[i] + tr_w[i]; + pNew->p[i][3] = tr_y[i] + tr_h[i]; + pNew->weight[i] = t1.weight[i] * weight_scale; } - newnode[counter].left = t1.left; - newnode[counter].right = t1.right; - newnode[counter].threshold = t1.threshold; - newnode[counter].alpha[0] = t1.alpha[0]; - newnode[counter].alpha[1] = t1.alpha[1]; - newnode[counter].alpha[2] = t1.alpha[2]; + pNew->left = t1.left; + pNew->right = t1.right; + pNew->threshold = t1.threshold; + pNew->alpha[0] = t1.alpha[0]; + pNew->alpha[1] = t1.alpha[1]; + pNew->alpha[2] = t1.alpha[2]; }