remove the device specific logic
This commit is contained in:
parent
620c699456
commit
ea433cc496
@ -43,10 +43,39 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
// specialized for non-image2d_t supported platform, intel HD4000, for example
|
||||
#ifdef DISABLE_IMAGE2D
|
||||
#define IMAGE_INT32 __global uint *
|
||||
#define IMAGE_INT8 __global uchar *
|
||||
#else
|
||||
#define IMAGE_INT32 image2d_t
|
||||
#define IMAGE_INT8 image2d_t
|
||||
#endif
|
||||
|
||||
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
|
||||
{
|
||||
#ifdef DISABLE_IMAGE2D
|
||||
int x = clamp(coord.x, 0, cols);
|
||||
int y = clamp(coord.y, 0, rows);
|
||||
return img[elemPerRow * y + x];
|
||||
#else
|
||||
return read_imageui(img, sam, coord).x;
|
||||
#endif
|
||||
}
|
||||
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
|
||||
{
|
||||
#ifdef DISABLE_IMAGE2D
|
||||
int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
|
||||
int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
|
||||
return img[elemPerRow * y + x];
|
||||
#else
|
||||
return (uchar)read_imageui(img, sam, coord).x;
|
||||
#endif
|
||||
}
|
||||
|
||||
// dynamically change the precision used for floating type
|
||||
|
||||
#if defined (__ATI__) || defined (__NVIDIA__)
|
||||
@ -58,14 +87,24 @@
|
||||
// Image read mode
|
||||
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
|
||||
#ifndef FLT_EPSILON
|
||||
#define FLT_EPSILON (1e-15)
|
||||
#define CV_PI_F 3.14159265f
|
||||
#endif
|
||||
|
||||
#ifndef CV_PI_F
|
||||
#define CV_PI_F 3.14159265f
|
||||
#endif
|
||||
|
||||
// Use integral image to calculate haar wavelets.
|
||||
// N = 2
|
||||
// for simple haar paatern
|
||||
float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
|
||||
float icvCalcHaarPatternSum_2(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
float ratio = (float)newSize / oldSize;
|
||||
@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
|
||||
int dy2 = convert_int_rte(ratio * src[k][3]);
|
||||
|
||||
F t = 0;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
|
||||
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
|
||||
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
}
|
||||
|
||||
@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
|
||||
}
|
||||
|
||||
// N = 3
|
||||
float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
|
||||
float icvCalcHaarPatternSum_3(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
float ratio = (float)newSize / oldSize;
|
||||
@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
|
||||
int dy2 = convert_int_rte(ratio * src[k][3]);
|
||||
|
||||
F t = 0;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
|
||||
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
|
||||
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
}
|
||||
|
||||
@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
|
||||
}
|
||||
|
||||
// N = 4
|
||||
float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
|
||||
float icvCalcHaarPatternSum_4(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
float ratio = (float)newSize / oldSize;
|
||||
@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
|
||||
int dy2 = convert_int_rte(ratio * src[k][3]);
|
||||
|
||||
F t = 0;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
|
||||
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
|
||||
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
|
||||
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
|
||||
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
|
||||
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
|
||||
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
}
|
||||
|
||||
@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)
|
||||
|
||||
//calculate targeted layer per-pixel determinant and trace with an integral image
|
||||
__kernel void icvCalcLayerDetAndTrace(
|
||||
image2d_t sumTex, // input integral image
|
||||
IMAGE_INT32 sumTex, // input integral image
|
||||
__global float * det, // output Determinant
|
||||
__global float * trace, // output trace
|
||||
int det_step, // the step of det in bytes
|
||||
@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
|
||||
int c_img_cols,
|
||||
int c_nOctaveLayers,
|
||||
int c_octave,
|
||||
int c_layer_rows
|
||||
int c_layer_rows,
|
||||
int sumTex_step
|
||||
)
|
||||
{
|
||||
det_step /= sizeof(*det);
|
||||
trace_step /= sizeof(*trace);
|
||||
sumTex_step/= sizeof(uint);
|
||||
// Determine the indices
|
||||
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
|
||||
const int blockIdx_y = get_group_id(1) % gridDim_y;
|
||||
@ -205,12 +255,12 @@ __kernel void icvCalcLayerDetAndTrace(
|
||||
|
||||
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
|
||||
{
|
||||
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
|
||||
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
|
||||
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
|
||||
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
|
||||
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
|
||||
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
|
||||
|
||||
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
}
|
||||
}
|
||||
|
||||
@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(
|
||||
|
||||
__constant float c_DM[5] = {0, 0, 9, 9, 1};
|
||||
|
||||
bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
|
||||
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
|
||||
{
|
||||
float ratio = (float)size / 9.0f;
|
||||
|
||||
@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
|
||||
|
||||
float t = 0;
|
||||
|
||||
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x;
|
||||
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x;
|
||||
t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x;
|
||||
t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x;
|
||||
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
|
||||
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
|
||||
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
|
||||
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
|
||||
|
||||
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
|
||||
@ -246,9 +296,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
|
||||
// Non-maximal suppression to further filtering the candidates from previous step
|
||||
__kernel
|
||||
void icvFindMaximaInLayer_withmask(
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global unsigned int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -261,7 +311,8 @@ __kernel
|
||||
int c_layer_cols,
|
||||
int c_max_candidates,
|
||||
float c_hessianThreshold,
|
||||
image2d_t maskSumTex
|
||||
IMAGE_INT32 maskSumTex,
|
||||
int mask_step
|
||||
)
|
||||
{
|
||||
volatile __local float N9[768]; // threads.x * threads.y * 3
|
||||
@ -269,6 +320,7 @@ __kernel
|
||||
det_step /= sizeof(*det);
|
||||
trace_step /= sizeof(*trace);
|
||||
maxCounter += counter_offset;
|
||||
mask_step /= sizeof(uint);
|
||||
|
||||
// Determine the indices
|
||||
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
|
||||
@ -288,26 +340,26 @@ __kernel
|
||||
// Is this thread within the hessian buffer?
|
||||
const int zoff = get_local_size(0) * get_local_size(1);
|
||||
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -321,7 +373,7 @@ __kernel
|
||||
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
|
||||
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
|
||||
|
||||
if (within_check(maskSumTex, sum_i, sum_j, size))
|
||||
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
|
||||
{
|
||||
// Check to see if we have a max (in its 26 neighbours)
|
||||
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
|
||||
@ -372,9 +424,9 @@ __kernel
|
||||
|
||||
__kernel
|
||||
void icvFindMaximaInLayer(
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global unsigned int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -417,19 +469,19 @@ __kernel
|
||||
int l_x = min(max(j, 0), c_img_cols - 1);
|
||||
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
|
||||
|
||||
N9[localLin - zoff] =
|
||||
N9[localLin - zoff] =
|
||||
det[det_step * (l_y - c_layer_rows) + l_x];
|
||||
N9[localLin ] =
|
||||
N9[localLin ] =
|
||||
det[det_step * (l_y ) + l_x];
|
||||
N9[localLin + zoff] =
|
||||
N9[localLin + zoff] =
|
||||
det[det_step * (l_y + c_layer_rows) + l_x];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -497,17 +549,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
{
|
||||
F invdet = 1.0 / det;
|
||||
|
||||
x[0] = invdet *
|
||||
x[0] = invdet *
|
||||
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
|
||||
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
|
||||
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
|
||||
|
||||
x[1] = invdet *
|
||||
x[1] = invdet *
|
||||
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
|
||||
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
|
||||
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
|
||||
|
||||
x[2] = invdet *
|
||||
x[2] = invdet *
|
||||
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
|
||||
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
|
||||
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
|
||||
@ -528,9 +580,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// INTERPOLATION
|
||||
__kernel
|
||||
__kernel
|
||||
void icvInterpolateKeypoint(
|
||||
__global const float * det,
|
||||
__global const float * det,
|
||||
__global const int4 * maxPosBuffer,
|
||||
__global float * keypoints,
|
||||
volatile __global unsigned int * featureCounter,
|
||||
@ -560,7 +612,7 @@ __kernel
|
||||
|
||||
volatile __local float N9[3][3][3];
|
||||
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
det[det_step * (c_layer_rows * layer + i) + j];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@ -658,27 +710,27 @@ __kernel
|
||||
|
||||
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
|
||||
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
|
||||
0.001707611023448408f, 0.001455130288377404f};
|
||||
|
||||
@ -691,27 +743,29 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
|
||||
data[tid] = partial_reduction;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 16)
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
#undef op
|
||||
}
|
||||
|
||||
__kernel
|
||||
void icvCalcOrientation(
|
||||
image2d_t sumTex,
|
||||
IMAGE_INT32 sumTex,
|
||||
__global float * keypoints,
|
||||
int keypoints_step,
|
||||
int c_img_rows,
|
||||
int c_img_cols
|
||||
int c_img_cols,
|
||||
int sum_step
|
||||
)
|
||||
{
|
||||
keypoints_step /= sizeof(*keypoints);
|
||||
sum_step /= sizeof(uint);
|
||||
__global float* featureX = keypoints + X_ROW * keypoints_step;
|
||||
__global float* featureY = keypoints + Y_ROW * keypoints_step;
|
||||
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
|
||||
@ -754,8 +808,8 @@ __kernel
|
||||
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
|
||||
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
|
||||
{
|
||||
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
|
||||
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
|
||||
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
|
||||
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
|
||||
|
||||
angle = atan2(Y, X);
|
||||
|
||||
@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
|
||||
|
||||
// utility for linear filter
|
||||
inline uchar readerGet(
|
||||
image2d_t src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
int i, int j
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
int i, int j, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
|
||||
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
|
||||
return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x;
|
||||
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
|
||||
}
|
||||
|
||||
inline float linearFilter(
|
||||
image2d_t src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
float y, float x
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
float y, float x, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
x -= 0.5f;
|
||||
@ -907,30 +961,33 @@ inline float linearFilter(
|
||||
const int x2 = x1 + 1;
|
||||
const int y2 = y1 + 1;
|
||||
|
||||
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
|
||||
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
|
||||
out = out + src_reg * ((x2 - x) * (y2 - y));
|
||||
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
|
||||
out = out + src_reg * ((x - x1) * (y2 - y));
|
||||
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
|
||||
out = out + src_reg * ((x2 - x) * (y - y1));
|
||||
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
|
||||
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
|
||||
out = out + src_reg * ((x - x1) * (y - y1));
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void calc_dx_dy(
|
||||
image2d_t imgTex,
|
||||
IMAGE_INT8 imgTex,
|
||||
volatile __local float s_dx_bin[25],
|
||||
volatile __local float s_dy_bin[25],
|
||||
volatile __local float s_PATCH[6][6],
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureDir
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureDir,
|
||||
int rows,
|
||||
int cols,
|
||||
int elemPerRow
|
||||
)
|
||||
{
|
||||
const float centerX = featureX[get_group_id(0)];
|
||||
@ -965,7 +1022,7 @@ void calc_dx_dy(
|
||||
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
|
||||
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
|
||||
|
||||
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
|
||||
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@ -976,26 +1033,26 @@ void calc_dx_dy(
|
||||
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
|
||||
|
||||
const float vx = (
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
* dw;
|
||||
const float vy = (
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
* dw;
|
||||
s_dx_bin[tid] = vx;
|
||||
s_dy_bin[tid] = vy;
|
||||
}
|
||||
}
|
||||
void reduce_sum25(
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
int tid
|
||||
)
|
||||
{
|
||||
@ -1033,18 +1090,20 @@ void reduce_sum25(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors64(
|
||||
image2d_t imgTex,
|
||||
volatile __global float * descriptors,
|
||||
IMAGE_INT8 imgTex,
|
||||
volatile __global float * descriptors,
|
||||
__global const float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step
|
||||
int keypoints_step,
|
||||
int rows,
|
||||
int cols,
|
||||
int img_step
|
||||
)
|
||||
{
|
||||
descriptors_step /= sizeof(float);
|
||||
keypoints_step /= sizeof(float);
|
||||
|
||||
__global const float * featureX = keypoints + X_ROW * keypoints_step;
|
||||
__global const float * featureY = keypoints + Y_ROW * keypoints_step;
|
||||
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
|
||||
@ -1057,7 +1116,7 @@ __kernel
|
||||
volatile __local float sdyabs[25];
|
||||
volatile __local float s_PATCH[6][6];
|
||||
|
||||
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
|
||||
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
|
||||
@ -1066,10 +1125,10 @@ __kernel
|
||||
{
|
||||
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
|
||||
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
|
||||
|
||||
@ -1083,13 +1142,16 @@ __kernel
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors128(
|
||||
image2d_t imgTex,
|
||||
__global volatile float * descriptors,
|
||||
IMAGE_INT8 imgTex,
|
||||
__global volatile float * descriptors,
|
||||
__global float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step
|
||||
int keypoints_step,
|
||||
int rows,
|
||||
int cols,
|
||||
int img_step
|
||||
)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
||||
@ -1111,7 +1173,7 @@ __kernel
|
||||
volatile __local float sdabs2[25];
|
||||
volatile __local float s_PATCH[6][6];
|
||||
|
||||
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
|
||||
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
|
||||
@ -1132,10 +1194,10 @@ __kernel
|
||||
sd2[tid] = sdx[tid];
|
||||
sdabs2[tid] = fabs(sdx[tid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
|
||||
|
||||
@ -1162,10 +1224,10 @@ __kernel
|
||||
sd2[tid] = sdy[tid];
|
||||
sdabs2[tid] = fabs(sdy[tid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
|
||||
if (tid == 0)
|
||||
@ -1178,7 +1240,7 @@ __kernel
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
||||
@ -1219,7 +1281,7 @@ __kernel
|
||||
// normalize and store in output
|
||||
descriptor_base[get_local_id(0)] = lookup / len;
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
||||
|
@ -1,4 +1,4 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
/*M/////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
@ -44,6 +44,7 @@
|
||||
//M*/
|
||||
#include <iomanip>
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
//#include "opencv2/highgui/highgui.hpp"
|
||||
|
||||
using namespace cv;
|
||||
@ -71,7 +72,7 @@ static inline int calcSize(int octave, int layer)
|
||||
|
||||
/* Wavelet size increment between layers. This should be an even number,
|
||||
such that the wavelet sizes in an octave are either all even or all odd.
|
||||
This ensures that when looking for the neighbours of a sample, the layers
|
||||
This ensures that when looking for the neighbors of a sample, the layers
|
||||
|
||||
above and below are aligned correctly. */
|
||||
const int HAAR_SIZE_INC = 6;
|
||||
@ -79,6 +80,41 @@ static inline int calcSize(int octave, int layer)
|
||||
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
|
||||
|
||||
// default kernel name can be any kernel in nonfree_surf.cl
|
||||
bool support_image2d(const char* kernel_name = "icvCalcLayerDetAndTrace")
|
||||
{
|
||||
static bool _isTested = false;
|
||||
static bool _support = false;
|
||||
if(_isTested)
|
||||
{
|
||||
return _support;
|
||||
}
|
||||
try
|
||||
{
|
||||
cv::ocl::openCLGetKernelFromSource(Context::getContext(), &nonfree_surf, kernel_name);
|
||||
_support = true;
|
||||
}
|
||||
catch (cv::Exception& e)
|
||||
{
|
||||
if(e.code == -217)
|
||||
{
|
||||
_support = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
// throw e once again
|
||||
cv::error(e);
|
||||
}
|
||||
}
|
||||
_isTested = true;
|
||||
return _support;
|
||||
}
|
||||
}
|
||||
|
||||
class SURF_OCL_Invoker
|
||||
{
|
||||
public:
|
||||
@ -88,7 +124,7 @@ public:
|
||||
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
|
||||
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
|
||||
|
||||
// kernel callers declearations
|
||||
// kernel callers declarations
|
||||
void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
|
||||
|
||||
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
|
||||
@ -100,14 +136,14 @@ public:
|
||||
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
|
||||
|
||||
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
|
||||
// end of kernel callers declearations
|
||||
// end of kernel callers declarations
|
||||
|
||||
|
||||
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
|
||||
surf_(surf),
|
||||
img_cols(img.cols), img_rows(img.rows),
|
||||
use_mask(!mask.empty()),
|
||||
imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
|
||||
use_mask(!mask.empty()), counters(oclMat()),
|
||||
imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
|
||||
{
|
||||
CV_Assert(!img.empty() && img.type() == CV_8UC1);
|
||||
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
|
||||
@ -131,12 +167,13 @@ public:
|
||||
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
|
||||
counters.setTo(Scalar::all(0));
|
||||
|
||||
//loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
|
||||
integral(img, surf_.sum);
|
||||
if(support_image2d())
|
||||
{
|
||||
bindImgTex(img, imgTex);
|
||||
bindImgTex(surf_.sum, sumTex);
|
||||
}
|
||||
|
||||
bindImgTex(img, imgTex);
|
||||
integral(img, surf_.sum); // the two argumented integral version is incorrect
|
||||
|
||||
bindImgTex(surf_.sum, sumTex);
|
||||
maskSumTex = 0;
|
||||
|
||||
if (use_mask)
|
||||
@ -155,7 +192,7 @@ public:
|
||||
void detectKeypoints(oclMat &keypoints)
|
||||
{
|
||||
// create image pyramid buffers
|
||||
// different layers have same sized buffers, but they are sampled from gaussin kernel.
|
||||
// different layers have same sized buffers, but they are sampled from Gaussian kernel.
|
||||
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
|
||||
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
|
||||
|
||||
@ -222,7 +259,6 @@ public:
|
||||
openCLFree(sumTex);
|
||||
if(maskSumTex)
|
||||
openCLFree(maskSumTex);
|
||||
additioalParamBuffer.release();
|
||||
}
|
||||
|
||||
private:
|
||||
@ -236,13 +272,13 @@ private:
|
||||
int maxFeatures;
|
||||
|
||||
oclMat counters;
|
||||
|
||||
|
||||
// texture buffers
|
||||
cl_mem imgTex;
|
||||
cl_mem sumTex;
|
||||
cl_mem maskSumTex;
|
||||
|
||||
oclMat additioalParamBuffer;
|
||||
const oclMat _img; // make a copy for non-image2d_t supported platform
|
||||
|
||||
SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
|
||||
{
|
||||
@ -362,11 +398,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
|
||||
{
|
||||
if (!img.empty())
|
||||
{
|
||||
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
|
||||
{
|
||||
cout << " Intel HD GPU device unsupported " << endl;
|
||||
return;
|
||||
}
|
||||
SURF_OCL_Invoker surf(*this, img, mask);
|
||||
|
||||
surf.detectKeypoints(keypoints);
|
||||
@ -378,11 +409,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
|
||||
{
|
||||
if (!img.empty())
|
||||
{
|
||||
if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
|
||||
{
|
||||
cout << " Intel HD GPU device unsupported " << endl;
|
||||
return;
|
||||
}
|
||||
SURF_OCL_Invoker surf(*this, img, mask);
|
||||
|
||||
if (!useProvidedKeypoints)
|
||||
@ -443,74 +469,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
|
||||
// bind source buffer to image oject.
|
||||
void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
|
||||
{
|
||||
cl_image_format format;
|
||||
int err;
|
||||
int depth = img.depth();
|
||||
int channels = img.channels();
|
||||
|
||||
switch(depth)
|
||||
{
|
||||
case CV_8U:
|
||||
format.image_channel_data_type = CL_UNSIGNED_INT8;
|
||||
break;
|
||||
case CV_32S:
|
||||
format.image_channel_data_type = CL_UNSIGNED_INT32;
|
||||
break;
|
||||
case CV_32F:
|
||||
format.image_channel_data_type = CL_FLOAT;
|
||||
break;
|
||||
default:
|
||||
throw std::exception();
|
||||
break;
|
||||
}
|
||||
switch(channels)
|
||||
{
|
||||
case 1:
|
||||
format.image_channel_order = CL_R;
|
||||
break;
|
||||
case 3:
|
||||
format.image_channel_order = CL_RGB;
|
||||
break;
|
||||
case 4:
|
||||
format.image_channel_order = CL_RGBA;
|
||||
break;
|
||||
default:
|
||||
throw std::exception();
|
||||
break;
|
||||
}
|
||||
if(texture)
|
||||
{
|
||||
openCLFree(texture);
|
||||
}
|
||||
|
||||
#ifdef CL_VERSION_1_2
|
||||
cl_image_desc desc;
|
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc.image_width = img.step / img.elemSize();
|
||||
desc.image_height = img.rows;
|
||||
desc.image_depth = 0;
|
||||
desc.image_array_size = 1;
|
||||
desc.image_row_pitch = 0;
|
||||
desc.image_slice_pitch = 0;
|
||||
desc.buffer = NULL;
|
||||
desc.num_mip_levels = 0;
|
||||
desc.num_samples = 0;
|
||||
texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
#else
|
||||
texture = clCreateImage2D(
|
||||
Context::getContext()->impl->clContext,
|
||||
CL_MEM_READ_WRITE,
|
||||
&format,
|
||||
img.step / img.elemSize(),
|
||||
img.rows,
|
||||
0,
|
||||
NULL,
|
||||
&err);
|
||||
#endif
|
||||
size_t origin[] = { 0, 0, 0 };
|
||||
size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
|
||||
clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
|
||||
openCLSafeCall(err);
|
||||
texture = bindTexture(img);
|
||||
}
|
||||
|
||||
////////////////////////////
|
||||
@ -525,7 +488,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
|
||||
string kernelName = "icvCalcLayerDetAndTrace";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
|
||||
if(sumTex)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
|
||||
}
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
|
||||
@ -535,6 +505,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
|
||||
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t globalThreads[3] =
|
||||
@ -543,7 +514,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
|
||||
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
|
||||
1
|
||||
};
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
|
||||
@ -573,16 +551,30 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
|
||||
|
||||
if(use_mask)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
|
||||
if(maskSumTex)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
|
||||
}
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
|
||||
}
|
||||
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
|
||||
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
|
||||
@ -607,7 +599,14 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
|
||||
size_t localThreads[3] = {3, 3, 3};
|
||||
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
|
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
|
||||
@ -617,16 +616,31 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
|
||||
if(sumTex)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
|
||||
}
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
|
||||
|
||||
size_t localThreads[3] = {32, 4, 1};
|
||||
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
|
||||
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
|
||||
@ -649,12 +663,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
globalThreads[1] = 16 * localThreads[1];
|
||||
|
||||
args.clear();
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
|
||||
if(imgTex)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
|
||||
}
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
|
||||
kernelName = "normalize_descriptors64";
|
||||
|
||||
@ -667,7 +698,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.clear();
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -680,12 +718,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
globalThreads[1] = 16 * localThreads[1];
|
||||
|
||||
args.clear();
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
|
||||
if(imgTex)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&_img.data));
|
||||
}
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
|
||||
kernelName = "normalize_descriptors128";
|
||||
|
||||
@ -698,7 +753,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.clear();
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
if(support_image2d())
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user