Fix ocl compilation error when using Intel OpenCL SDK.

This commit is contained in:
peng xiao 2013-08-01 13:06:33 +08:00
parent 9b5d1596dc
commit fd77a49e76

View File

@ -16,6 +16,7 @@
// //
// @Authors // @Authors
// Peng Xiao, pengxiao@multicorewareinc.com // Peng Xiao, pengxiao@multicorewareinc.com
// Sen Liu, swjtuls1987@126.com
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
@ -43,9 +44,6 @@
// //
//M*/ //M*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// specialized for non-image2d_t supported platform, intel HD4000, for example // specialized for non-image2d_t supported platform, intel HD4000, for example
#ifdef DISABLE_IMAGE2D #ifdef DISABLE_IMAGE2D
#define IMAGE_INT32 __global uint * #define IMAGE_INT32 __global uint *
@ -105,7 +103,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
// for simple haar paatern // for simple haar paatern
float icvCalcHaarPatternSum_2( float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float2 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -116,21 +114,24 @@ float icvCalcHaarPatternSum_2(
F d = 0; F d = 0;
#pragma unroll int2 dx1 = convert_int2_rte(ratio * src[0]);
for (int k = 0; k < 2; ++k) int2 dy1 = convert_int2_rte(ratio * src[1]);
{ int2 dx2 = convert_int2_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int2 dy2 = convert_int2_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
return (float)d; return (float)d;
} }
@ -138,7 +139,7 @@ float icvCalcHaarPatternSum_2(
// N = 3 // N = 3
float icvCalcHaarPatternSum_3( float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float4 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -149,21 +150,31 @@ float icvCalcHaarPatternSum_3(
F d = 0; F d = 0;
#pragma unroll int4 dx1 = convert_int4_rte(ratio * src[0]);
for (int k = 0; k < 3; ++k) int4 dy1 = convert_int4_rte(ratio * src[1]);
{ int4 dx2 = convert_int4_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int4 dy2 = convert_int4_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
return (float)d; return (float)d;
} }
@ -171,7 +182,7 @@ float icvCalcHaarPatternSum_3(
// N = 4 // N = 4
float icvCalcHaarPatternSum_4( float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__constant float src[2][5], __constant float4 *src,
int oldSize, int oldSize,
int newSize, int newSize,
int y, int x, int y, int x,
@ -182,21 +193,38 @@ float icvCalcHaarPatternSum_4(
F d = 0; F d = 0;
#pragma unroll int4 dx1 = convert_int4_rte(ratio * src[0]);
for (int k = 0; k < 4; ++k) int4 dy1 = convert_int4_rte(ratio * src[1]);
{ int4 dx2 = convert_int4_rte(ratio * src[2]);
int dx1 = convert_int_rte(ratio * src[k][0]); int4 dy2 = convert_int4_rte(ratio * src[3]);
int dy1 = convert_int_rte(ratio * src[k][1]);
int dx2 = convert_int_rte(ratio * src[k][2]);
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow ); t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow ); t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
}
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
return (float)d; return (float)d;
} }
@ -204,9 +232,9 @@ float icvCalcHaarPatternSum_4(
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Hessian // Hessian
__constant float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} }; __constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
__constant float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; __constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
__constant float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; __constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
__inline int calcSize(int octave, int layer) __inline int calcSize(int octave, int layer)
{ {
@ -236,7 +264,7 @@ __kernel void icvCalcLayerDetAndTrace(
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
int sumTex_step int sumTex_step
) )
{ {
det_step /= sizeof(*det); det_step /= sizeof(*det);
trace_step /= sizeof(*trace); trace_step /= sizeof(*trace);
@ -300,7 +328,7 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
// Non-maximal suppression to further filtering the candidates from previous step // Non-maximal suppression to further filtering the candidates from previous step
__kernel __kernel
void icvFindMaximaInLayer_withmask( void icvFindMaximaInLayer_withmask(
__global const float * det, __global const float * det,
__global const float * trace, __global const float * trace,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
@ -318,7 +346,7 @@ __kernel
float c_hessianThreshold, float c_hessianThreshold,
IMAGE_INT32 maskSumTex, IMAGE_INT32 maskSumTex,
int mask_step int mask_step
) )
{ {
volatile __local float N9[768]; // threads.x * threads.y * 3 volatile __local float N9[768]; // threads.x * threads.y * 3
@ -428,7 +456,7 @@ __kernel
} }
__kernel __kernel
void icvFindMaximaInLayer( void icvFindMaximaInLayer(
__global float * det, __global float * det,
__global float * trace, __global float * trace,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
@ -444,7 +472,7 @@ __kernel
int c_layer_cols, int c_layer_cols,
int c_max_candidates, int c_max_candidates,
float c_hessianThreshold float c_hessianThreshold
) )
{ {
volatile __local float N9[768]; // threads.x * threads.y * 3 volatile __local float N9[768]; // threads.x * threads.y * 3
@ -544,30 +572,30 @@ __kernel
} }
// solve 3x3 linear system Ax=b for floating point input // solve 3x3 linear system Ax=b for floating point input
inline bool solve3x3_float(volatile __local const float A[3][3], volatile __local const float b[3], volatile __local float x[3]) inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x)
{ {
float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]); + A[0].z * (A[1].x * A[2].y - A[1].y * A[2].x);
if (det != 0) if (det != 0)
{ {
F invdet = 1.0 / det; F invdet = 1.0 / det;
x[0] = invdet * x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + A[0].y * (b[1] * A[2].z - A[1].z * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); A[0].z * (b[1] * A[2].y - A[1].y * b[2] ));
x[1] = invdet * x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - (A[0].x * (b[1] * A[2].z - A[1].z * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + b[0] * (A[1].x * A[2].z - A[1].z * A[2].x) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); A[0].z * (A[1].x * b[2] - b[1] * A[2].x));
x[2] = invdet * x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - (A[0].x * (A[1].y * b[2] - b[1] * A[2].y) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + A[0].y * (A[1].x * b[2] - b[1] * A[2].x) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); b[0] * (A[1].x * A[2].y - A[1].y * A[2].x));
return true; return true;
} }
@ -586,7 +614,7 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// INTERPOLATION // INTERPOLATION
__kernel __kernel
void icvInterpolateKeypoint( void icvInterpolateKeypoint(
__global const float * det, __global const float * det,
__global const int4 * maxPosBuffer, __global const int4 * maxPosBuffer,
__global float * keypoints, __global float * keypoints,
@ -598,7 +626,7 @@ __kernel
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
int c_max_features int c_max_features
) )
{ {
det_step /= sizeof(*det); det_step /= sizeof(*det);
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
@ -632,26 +660,26 @@ __kernel
//ds //ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
volatile __local float H[3][3]; volatile __local float4 H[3];
//dxx //dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy //dxy
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]); H[0].y= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
//dxs //dxs
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]); H[0].z= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
//dyx = dxy //dyx = dxy
H[1][0] = H[0][1]; H[1].x = H[0].y;
//dyy //dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1]; H[1].y = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys //dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]); H[1].z= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs //dsx = dxs
H[2][0] = H[0][2]; H[2].x = H[0].z;
//dsy = dys //dsy = dys
H[2][1] = H[1][2]; H[2].y = H[1].z;
//dss //dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
volatile __local float x[3]; volatile __local float x[3];
@ -737,10 +765,11 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f}; 0.001707611023448408f, 0.001455130288377404f
};
__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant float2 c_NX[5] = { (float2)(0, 2), (float2)(0, 0), (float2)(2, 4), (float2)(4, 4), (float2)(-1, 1) };
__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; __constant float2 c_NY[5] = { (float2)(0, 0), (float2)(0, 2), (float2)(4, 4), (float2)(2, 4), (float2)(1, -1) };
void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid) void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
{ {
@ -759,14 +788,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
if (tid < 8) if (tid < 8)
{ {
#endif #endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8]);
#if WAVE_SIZE < 8 #if WAVE_SIZE < 8
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4) if (tid < 4)
{ {
#endif #endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4]);
#if WAVE_SIZE < 4 #if WAVE_SIZE < 4
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -787,14 +816,14 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
} }
__kernel __kernel
void icvCalcOrientation( void icvCalcOrientation(
IMAGE_INT32 sumTex, IMAGE_INT32 sumTex,
__global float * keypoints, __global float * keypoints,
int keypoints_step, int keypoints_step,
int c_img_rows, int c_img_rows,
int c_img_cols, int c_img_cols,
int sum_step int sum_step
) )
{ {
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
sum_step /= sizeof(uint); sum_step /= sizeof(uint);
@ -934,11 +963,11 @@ __kernel
__kernel __kernel
void icvSetUpright( void icvSetUpright(
__global float * keypoints, __global float * keypoints,
int keypoints_step, int keypoints_step,
int nFeatures int nFeatures
) )
{ {
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
@ -988,7 +1017,7 @@ inline uchar readerGet(
IMAGE_INT8 src, IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j, int rows, int cols, int elemPerRow int i, int j, int rows, int cols, int elemPerRow
) )
{ {
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
@ -999,7 +1028,7 @@ inline float linearFilter(
IMAGE_INT8 src, IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x, int rows, int cols, int elemPerRow float y, float x, int rows, int cols, int elemPerRow
) )
{ {
x -= 0.5f; x -= 0.5f;
y -= 0.5f; y -= 0.5f;
@ -1028,9 +1057,9 @@ inline float linearFilter(
void calc_dx_dy( void calc_dx_dy(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
volatile __local float s_dx_bin[25], volatile __local float *s_dx_bin,
volatile __local float s_dy_bin[25], volatile __local float *s_dy_bin,
volatile __local float s_PATCH[6][6], volatile __local float *s_PATCH,
__global const float* featureX, __global const float* featureX,
__global const float* featureY, __global const float* featureY,
__global const float* featureSize, __global const float* featureSize,
@ -1038,7 +1067,7 @@ void calc_dx_dy(
int rows, int rows,
int cols, int cols,
int elemPerRow int elemPerRow
) )
{ {
const float centerX = featureX[get_group_id(0)]; const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)]; const float centerY = featureY[get_group_id(0)];
@ -1048,6 +1077,7 @@ void calc_dx_dy(
{ {
descriptor_dir = 0.0f; descriptor_dir = 0.0f;
} }
descriptor_dir *= (float)(CV_PI_F / 180.0f); descriptor_dir *= (float)(CV_PI_F / 180.0f);
/* The sampling intervals and wavelet sized for selecting an orientation /* The sampling intervals and wavelet sized for selecting an orientation
@ -1074,7 +1104,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1085,16 +1115,16 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = ( const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ])
* dw; * dw;
const float vy = ( const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] + s_PATCH[ get_local_id(1) * 6 + get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) s_PATCH[ get_local_id(1) * 6 + get_local_id(0) + 1])
* dw; * dw;
s_dx_bin[tid] = vx; s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy; s_dy_bin[tid] = vy;
@ -1106,7 +1136,7 @@ void reduce_sum25(
volatile __local float* sdata3, volatile __local float* sdata3,
volatile __local float* sdata4, volatile __local float* sdata4,
int tid int tid
) )
{ {
#ifndef WAVE_SIZE #ifndef WAVE_SIZE
#define WAVE_SIZE 1 #define WAVE_SIZE 1
@ -1125,11 +1155,8 @@ void reduce_sum25(
{ {
#endif #endif
sdata1[tid] += sdata1[tid + 8]; sdata1[tid] += sdata1[tid + 8];
sdata2[tid] += sdata2[tid + 8]; sdata2[tid] += sdata2[tid + 8];
sdata3[tid] += sdata3[tid + 8]; sdata3[tid] += sdata3[tid + 8];
sdata4[tid] += sdata4[tid + 8]; sdata4[tid] += sdata4[tid + 8];
#if WAVE_SIZE < 8 #if WAVE_SIZE < 8
} }
@ -1166,7 +1193,7 @@ void reduce_sum25(
} }
__kernel __kernel
void compute_descriptors64( void compute_descriptors64(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
__global float * descriptors, __global float * descriptors,
__global const float * keypoints, __global const float * keypoints,
@ -1175,7 +1202,7 @@ __kernel
int rows, int rows,
int cols, int cols,
int img_step int img_step
) )
{ {
descriptors_step /= sizeof(float); descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float); keypoints_step /= sizeof(float);
@ -1189,7 +1216,7 @@ __kernel
volatile __local float sdy[25]; volatile __local float sdy[25];
volatile __local float sdxabs[25]; volatile __local float sdxabs[25];
volatile __local float sdyabs[25]; volatile __local float sdyabs[25];
volatile __local float s_PATCH[6][6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1221,7 +1248,7 @@ __kernel
} }
} }
__kernel __kernel
void compute_descriptors128( void compute_descriptors128(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
__global float * descriptors, __global float * descriptors,
__global float * keypoints, __global float * keypoints,
@ -1230,7 +1257,7 @@ __kernel
int rows, int rows,
int cols, int cols,
int img_step int img_step
) )
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
@ -1249,7 +1276,7 @@ __kernel
volatile __local float sd2[25]; volatile __local float sd2[25];
volatile __local float sdabs1[25]; volatile __local float sdabs1[25];
volatile __local float sdabs2[25]; volatile __local float sdabs2[25];
volatile __local float s_PATCH[6][6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1306,7 +1333,6 @@ __kernel
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1322,11 +1348,13 @@ __kernel
} }
} }
} }
void reduce_sum128(volatile __local float* smem, int tid) void reduce_sum128(volatile __local float* smem, int tid)
{ {
#ifndef WAVE_SIZE #ifndef WAVE_SIZE
#define WAVE_SIZE 1 #define WAVE_SIZE 1
#endif #endif
if (tid < 64) if (tid < 64)
{ {
smem[tid] += smem[tid + 64]; smem[tid] += smem[tid + 64];
@ -1374,6 +1402,8 @@ void reduce_sum128(volatile __local float* smem, int tid)
smem[tid] += smem[tid + 1]; smem[tid] += smem[tid + 1];
} }
} }
void reduce_sum64(volatile __local float* smem, int tid) void reduce_sum64(volatile __local float* smem, int tid)
{ {
#ifndef WAVE_SIZE #ifndef WAVE_SIZE
@ -1421,7 +1451,7 @@ void reduce_sum64(volatile __local float* smem, int tid)
} }
__kernel __kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step) void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID
@ -1436,8 +1466,6 @@ __kernel
reduce_sum128(sqDesc, get_local_id(0)); reduce_sum128(sqDesc, get_local_id(0));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// compute length (square root) // compute length (square root)
volatile __local float len; volatile __local float len;
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
@ -1450,7 +1478,7 @@ __kernel
descriptor_base[get_local_id(0)] = lookup / len; descriptor_base[get_local_id(0)] = lookup / len;
} }
__kernel __kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step) void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID
@ -1462,7 +1490,6 @@ __kernel
sqDesc[get_local_id(0)] = lookup * lookup; sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum64(sqDesc, get_local_id(0)); reduce_sum64(sqDesc, get_local_id(0));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);