Merge pull request #1210 from pengx17:2.4_ocl_surf_intel_fix

This commit is contained in:
Roman Donchenko 2013-08-16 12:22:18 +04:00 committed by OpenCV Buildbot
commit ac8506db32

View File

@ -16,6 +16,7 @@
// //
// @Authors // @Authors
// Peng Xiao, pengxiao@multicorewareinc.com // Peng Xiao, pengxiao@multicorewareinc.com
// Sen Liu, swjtuls1987@126.com
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
@ -43,9 +44,6 @@
// //
//M*/ //M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// specialized for non-image2d_t supported platform, intel HD4000, for example // specialized for non-image2d_t supported platform, intel HD4000, for example
#ifdef DISABLE_IMAGE2D #ifdef DISABLE_IMAGE2D
#define IMAGE_INT32 __global uint * #define IMAGE_INT32 __global uint *
@ -105,7 +103,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
// for simple haar paatern // for simple haar paatern
float icvCalcHaarPatternSum_2( float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float2 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -116,21 +114,24 @@ float icvCalcHaarPatternSum_2(
F d = 0; F d = 0;
#pragma unroll int2 dx1 = convert_int2_rte(ratio * src[0]);
for (int k = 0; k < 2; ++k) int2 dy1 = convert_int2_rte(ratio * src[1]);
{ int2 dx2 = convert_int2_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int2 dy2 = convert_int2_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
return (float)d; return (float)d;
} }
@ -138,7 +139,7 @@ float icvCalcHaarPatternSum_2(
// N = 3 // N = 3
float icvCalcHaarPatternSum_3( float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float4 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -149,21 +150,31 @@ float icvCalcHaarPatternSum_3(
F d = 0; F d = 0;
#pragma unroll int4 dx1 = convert_int4_rte(ratio * src[0]);
for (int k = 0; k < 3; ++k) int4 dy1 = convert_int4_rte(ratio * src[1]);
{ int4 dx2 = convert_int4_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int4 dy2 = convert_int4_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
return (float)d; return (float)d;
} }
@ -171,7 +182,7 @@ float icvCalcHaarPatternSum_3(
// N = 4 // N = 4
float icvCalcHaarPatternSum_4( float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float4 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -182,21 +193,38 @@ float icvCalcHaarPatternSum_4(
F d = 0; F d = 0;
#pragma unroll int4 dx1 = convert_int4_rte(ratio * src[0]);
for (int k = 0; k < 4; ++k) int4 dy1 = convert_int4_rte(ratio * src[1]);
{ int4 dx2 = convert_int4_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int4 dy2 = convert_int4_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
return (float)d; return (float)d;
} }
@ -204,9 +232,9 @@ float icvCalcHaarPatternSum_4(
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Hessian // Hessian
__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} }; __constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; __constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; __constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
__inline int calcSize(int octave, int layer) __inline int calcSize(int octave, int layer)
{ {
@ -544,30 +572,30 @@ __kernel
} }
// solve 3x3 linear system Ax=b for floating point input // solve 3x3 linear system Ax=b for floating point input
inline bool solve3x3_float(volatile __local const float A[3][3], volatile __local const float b[3], volatile __local float x[3]) inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x)
{ {
float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]); + A[0].z * (A[1].x * A[2].y - A[1].y * A[2].x);
if (det != 0) if (det != 0)
{ {
F invdet = 1.0 / det; F invdet = 1.0 / det;
x[0] = invdet * x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + A[0].y * (b[1] * A[2].z - A[1].z * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); A[0].z * (b[1] * A[2].y - A[1].y * b[2] ));
x[1] = invdet * x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - (A[0].x * (b[1] * A[2].z - A[1].z * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + b[0] * (A[1].x * A[2].z - A[1].z * A[2].x) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); A[0].z * (A[1].x * b[2] - b[1] * A[2].x));
x[2] = invdet * x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - (A[0].x * (A[1].y * b[2] - b[1] * A[2].y) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + A[0].y * (A[1].x * b[2] - b[1] * A[2].x) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); b[0] * (A[1].x * A[2].y - A[1].y * A[2].x));
return true; return true;
} }
@ -632,26 +660,26 @@ __kernel
//ds //ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
volatile __local float H[3][3]; volatile __local float4 H[3];
//dxx //dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy //dxy
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]); H[0].y= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
//dxs //dxs
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]); H[0].z= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
//dyx = dxy //dyx = dxy
H[1][0] = H[0][1]; H[1].x = H[0].y;
//dyy //dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1]; H[1].y = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys //dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]); H[1].z= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs //dsx = dxs
H[2][0] = H[0][2]; H[2].x = H[0].z;
//dsy = dys //dsy = dys
H[2][1] = H[1][2]; H[2].y = H[1].z;
//dss //dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
volatile __local float x[3]; volatile __local float x[3];
@ -737,10 +765,11 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f}; 0.001707611023448408f, 0.001455130288377404f
};
__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant float2 c_NX[5] = { (float2)(0, 2), (float2)(0, 0), (float2)(2, 4), (float2)(4, 4), (float2)(-1, 1) };
__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; __constant float2 c_NY[5] = { (float2)(0, 0), (float2)(0, 2), (float2)(4, 4), (float2)(2, 4), (float2)(1, -1) };
void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid) void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
{ {
@ -1028,9 +1057,9 @@ inline float linearFilter(
void calc_dx_dy( void calc_dx_dy(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
volatile __local float s_dx_bin[25], volatile __local float *s_dx_bin,
volatile __local float s_dy_bin[25], volatile __local float *s_dy_bin,
volatile __local float s_PATCH[6][6], volatile __local float *s_PATCH,
__global const float* featureX, __global const float* featureX,
__global const float* featureY, __global const float* featureY,
__global const float* featureSize, __global const float* featureSize,
@ -1048,6 +1077,7 @@ void calc_dx_dy(
{ {
descriptor_dir = 0.0f; descriptor_dir = 0.0f;
} }
descriptor_dir *= (float)(CV_PI_F / 180.0f); descriptor_dir *= (float)(CV_PI_F / 180.0f);
/* The sampling intervals and wavelet sized for selecting an orientation /* The sampling intervals and wavelet sized for selecting an orientation
@ -1074,7 +1104,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1085,16 +1115,16 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = ( const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ])
* dw; * dw;
const float vy = ( const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1])
* dw; * dw;
s_dx_bin[tid] = vx; s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy; s_dy_bin[tid] = vy;
@ -1125,11 +1155,8 @@ void reduce_sum25(
{ {
#endif #endif
sdata1[tid] += sdata1[tid + 8]; sdata1[tid] += sdata1[tid + 8];
sdata2[tid] += sdata2[tid + 8]; sdata2[tid] += sdata2[tid + 8];
sdata3[tid] += sdata3[tid + 8]; sdata3[tid] += sdata3[tid + 8];
sdata4[tid] += sdata4[tid + 8]; sdata4[tid] += sdata4[tid + 8];
#if WAVE_SIZE < 8 #if WAVE_SIZE < 8
} }
@ -1189,7 +1216,7 @@ __kernel
volatile __local float sdy[25]; volatile __local float sdy[25];
volatile __local float sdxabs[25]; volatile __local float sdxabs[25];
volatile __local float sdyabs[25]; volatile __local float sdyabs[25];
volatile __local float s_PATCH[6][6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1249,7 +1276,7 @@ __kernel
volatile __local float sd2[25]; volatile __local float sd2[25];
volatile __local float sdabs1[25]; volatile __local float sdabs1[25];
volatile __local float sdabs2[25]; volatile __local float sdabs2[25];
volatile __local float s_PATCH[6][6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1306,7 +1333,6 @@ __kernel
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1322,11 +1348,13 @@ __kernel
} }
} }
} }
void reduce_sum128(volatile __local float* smem, int tid) void reduce_sum128(volatile __local float* smem, int tid)
{ {
#ifndef WAVE_SIZE #ifndef WAVE_SIZE
#define WAVE_SIZE 1 #define WAVE_SIZE 1
#endif #endif
if (tid < 64) if (tid < 64)
{ {
smem[tid] += smem[tid + 64]; smem[tid] += smem[tid + 64];
@ -1374,6 +1402,8 @@ void reduce_sum128(volatile __local float* smem, int tid)
smem[tid] += smem[tid + 1]; smem[tid] += smem[tid + 1];
} }
} }
void reduce_sum64(volatile __local float* smem, int tid) void reduce_sum64(volatile __local float* smem, int tid)
{ {
#ifndef WAVE_SIZE #ifndef WAVE_SIZE
@ -1436,8 +1466,6 @@ __kernel
reduce_sum128(sqDesc, get_local_id(0)); reduce_sum128(sqDesc, get_local_id(0));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// compute length (square root) // compute length (square root)
volatile __local float len; volatile __local float len;
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
@ -1462,7 +1490,6 @@ __kernel
sqDesc[get_local_id(0)] = lookup * lookup; sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum64(sqDesc, get_local_id(0)); reduce_sum64(sqDesc, get_local_id(0));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);