From c18d1ee2a91685d5279a7db4b4e2b6133415071b Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 4 Feb 2014 20:00:51 +0400 Subject: [PATCH] SURF kind of works (let's see if the tests pass) --- modules/core/include/opencv2/core/ocl.hpp | 2 +- modules/nonfree/src/opencl/surf.cl | 477 +++++++--------------- modules/nonfree/src/surf.cpp | 2 +- modules/nonfree/src/surf.hpp | 16 +- modules/nonfree/src/surf.ocl.cpp | 176 +++----- 5 files changed, 208 insertions(+), 465 deletions(-) diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 8d9400224..44235693c 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -585,7 +585,7 @@ class CV_EXPORTS Image2D { public: Image2D(); - Image2D(const UMat &src); + explicit Image2D(const UMat &src); ~Image2D(); void* ptr() const; diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index c7c4c7d68..b038ef00e 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -52,35 +52,52 @@ #define ORI_LOCAL_SIZE (360 / ORI_SEARCH_INC) // specialized for non-image2d_t supported platform, intel HD4000, for example -#ifdef DISABLE_IMAGE2D -#define IMAGE_INT32 __global uint * -#define IMAGE_INT8 __global uchar * -#else -#define IMAGE_INT32 image2d_t -#define IMAGE_INT8 image2d_t -#endif +#ifndef HAVE_IMAGE2D +__inline uint read_sumTex_(__global uint* sumTex, int sum_step, int img_rows, int img_cols, int2 coord) +{ + int x = clamp(coord.x, 0, img_cols); + int y = clamp(coord.y, 0, img_rows); + return sumTex[sum_step * y + x]; +} -uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow) +__inline uchar read_imgTex_(__global uchar* imgTex, int img_step, int img_rows, int img_cols, float2 coord) { -#ifdef DISABLE_IMAGE2D - int x = clamp(coord.x, 0, cols); - int y = clamp(coord.y, 0, rows); - return img[elemPerRow * y + x]; -#else - return read_imageui(img, sam, coord).x; -#endif + int x = clamp(convert_int_rte(coord.x), 0, img_cols-1); + int y = clamp(convert_int_rte(coord.y), 0, img_rows-1); + return imgTex[img_step * y + x]; } -uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) + +#define read_sumTex(coord) read_sumTex_(sumTex, sum_step, img_rows, img_cols, coord) +#define read_imgTex(coord) read_imgTex_(imgTex, img_step, img_rows, img_cols, coord) + +#define __PARAM_sumTex__ __global uint* sumTex, int sum_step, int sum_offset +#define __PARAM_imgTex__ __global uchar* imgTex, int img_step, int img_offset + +#define __PASS_sumTex__ sumTex, sum_step, sum_offset +#define __PASS_imgTex__ imgTex, img_step, img_offset + +#else +__inline uint read_sumTex_(image2d_t sumTex, sampler_t sam, int2 coord) { -#ifdef DISABLE_IMAGE2D - int x = clamp(round(coord.x), 0, cols - 1); - int y = clamp(round(coord.y), 0, rows - 1); - return img[elemPerRow * y + x]; -#else - return (uchar)read_imageui(img, sam, coord).x; -#endif + return read_imageui(sumTex, sam, coord).x; } +__inline uchar read_imgTex_(image2d_t imgTex, sampler_t sam, float2 coord) +{ + return (uchar)read_imageui(imgTex, sam, coord).x; +} + +#define read_sumTex(coord) read_sumTex_(sumTex, sampler, coord) +#define read_imgTex(coord) read_imgTex_(imgTex, sampler, coord) + +#define __PARAM_sumTex__ image2d_t sumTex +#define __PARAM_imgTex__ image2d_t imgTex + +#define __PASS_sumTex__ sumTex +#define __PASS_imgTex__ imgTex + +#endif + // dynamically change the precision used for floating type #if defined (DOUBLE_SUPPORT) @@ -95,7 +112,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col #endif // Image read mode -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; #ifndef FLT_EPSILON #define FLT_EPSILON (1e-15) @@ -105,45 +122,6 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM #define CV_PI_F 3.14159265f #endif - -// Use integral image to calculate haar wavelets. -// N = 2 -// for simple haar paatern -float icvCalcHaarPatternSum_2( - IMAGE_INT32 sumTex, - __constant float2 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int2 dx1 = convert_int2(round(ratio * src[0])); - int2 dy1 = convert_int2(round(ratio * src[1])); - int2 dx2 = convert_int2(round(ratio * src[2])); - int2 dy2 = convert_int2(round(ratio * src[3])); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - return (float)d; -} - //////////////////////////////////////////////////////////////////////// // Hessian @@ -182,22 +160,20 @@ F calcAxisAlignedDerivative( //calculate targeted layer per-pixel determinant and trace with an integral image __kernel void SURF_calcLayerDetAndTrace( - IMAGE_INT32 sumTex, // input integral image - __global float * det, // output Determinant + __PARAM_sumTex__, // input integral image + int img_rows, int img_cols, + int c_nOctaveLayers, int c_octave, int c_layer_rows, + + __global float * det, // output determinant + int det_step, int det_offset, __global float * trace, // output trace - int det_step, // the step of det in bytes - int trace_step, // the step of trace in bytes - int c_img_rows, - int c_img_cols, - int c_nOctaveLayers, - int c_octave, - int c_layer_rows, - int sumTex_step - ) + int trace_step, int trace_offset) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); - sumTex_step/= sizeof(uint); + #ifndef HAVE_IMAGE2D + sum_step/= sizeof(uint); + #endif // Determine the indices const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); const int blockIdx_y = get_group_id(1) % gridDim_y; @@ -209,13 +185,13 @@ __kernel void SURF_calcLayerDetAndTrace( const int size = calcSize(c_octave, layer); - const int samples_i = 1 + ((c_img_rows - size) >> c_octave); - const int samples_j = 1 + ((c_img_cols - size) >> c_octave); + const int samples_i = 1 + ((img_rows - size) >> c_octave); + const int samples_j = 1 + ((img_cols - size) >> c_octave); // Ignore pixels where some of the kernel is outside the image const int margin = (size >> 1) >> c_octave; - if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) + if (size <= img_rows && size <= img_cols && i < samples_i && j < samples_j) { int x = j << c_octave; int y = i << c_octave; @@ -239,14 +215,14 @@ __kernel void SURF_calcLayerDetAndTrace( { // Some of the pixels needed to compute the derivative are // repeated, so we only don't duplicate the fetch here. - int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); - int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); - int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); - int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); - int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); - int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); - int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); - int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t02 = read_sumTex( (int2)(x, y + r2)); + int t07 = read_sumTex( (int2)(x, y + r7)); + int t32 = read_sumTex( (int2)(x + r3, y + r2)); + int t37 = read_sumTex( (int2)(x + r3, y + r7)); + int t62 = read_sumTex( (int2)(x + r6, y + r2)); + int t67 = read_sumTex( (int2)(x + r6, y + r7)); + int t92 = read_sumTex( (int2)(x + r9, y + r2)); + int t97 = read_sumTex( (int2)(x + r9, y + r7)); d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), t62, t67, t92, t97, (r9 - r6) * (r7 - r2), @@ -259,14 +235,14 @@ __kernel void SURF_calcLayerDetAndTrace( { // Some of the pixels needed to compute the derivative are // repeated, so we only don't duplicate the fetch here. - int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); - int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); - int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); - int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); - int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); - int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); - int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); - int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); + int t20 = read_sumTex( (int2)(x + r2, y) ); + int t23 = read_sumTex( (int2)(x + r2, y + r3) ); + int t70 = read_sumTex( (int2)(x + r7, y) ); + int t73 = read_sumTex( (int2)(x + r7, y + r3) ); + int t26 = read_sumTex( (int2)(x + r2, y + r6) ); + int t76 = read_sumTex( (int2)(x + r7, y + r6) ); + int t29 = read_sumTex( (int2)(x + r2, y + r9) ); + int t79 = read_sumTex( (int2)(x + r7, y + r9) ); d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), t26, t29, t76, t79, (r7 - r2) * (r9 - r6), @@ -280,31 +256,31 @@ __kernel void SURF_calcLayerDetAndTrace( // There's no saving us here, we just have to get all of the pixels in // separate fetches F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); - t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( (int2)(x + r1, y + r1) ); + t -= read_sumTex( (int2)(x + r1, y + r4) ); + t -= read_sumTex( (int2)(x + r4, y + r1) ); + t += read_sumTex( (int2)(x + r4, y + r4) ); d += t / ((r4 - r1) * (r4 - r1)); t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); - t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( (int2)(x + r5, y + r1) ); + t -= read_sumTex( (int2)(x + r5, y + r4) ); + t -= read_sumTex( (int2)(x + r8, y + r1) ); + t += read_sumTex( (int2)(x + r8, y + r4) ); d -= t / ((r8 - r5) * (r4 - r1)); t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); - t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( (int2)(x + r1, y + r5) ); + t -= read_sumTex( (int2)(x + r1, y + r8) ); + t -= read_sumTex( (int2)(x + r4, y + r5) ); + t += read_sumTex( (int2)(x + r4, y + r8) ); d -= t / ((r4 - r1) * (r8 - r5)); t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); - t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); - t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( (int2)(x + r5, y + r5) ); + t -= read_sumTex( (int2)(x + r5, y + r8) ); + t -= read_sumTex( (int2)(x + r8, y + r5) ); + t += read_sumTex( (int2)(x + r8, y + r8) ); d += t / ((r8 - r5) * (r8 - r5)); } const float dxy = (float)d; @@ -317,171 +293,17 @@ __kernel void SURF_calcLayerDetAndTrace( //////////////////////////////////////////////////////////////////////// // NONMAX -__constant float c_DM[5] = {0, 0, 9, 9, 1}; - -bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step) -{ - float ratio = (float)size / 9.0f; - - float d = 0; - - int dx1 = round(ratio * c_DM[0]); - int dy1 = round(ratio * c_DM[1]); - int dx2 = round(ratio * c_DM[2]); - int dy2 = round(ratio * c_DM[3]); - - float t = 0; - - t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step); - t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step); - t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step); - t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step); - - d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); - - return (d >= 0.5f); -} - -// Non-maximal suppression to further filtering the candidates from previous step -__kernel -void SURF_findMaximaInLayerWithMask( - __global const float * det, - __global const float * trace, - __global int4 * maxPosBuffer, - volatile __global int* maxCounter, - int counter_offset, - int det_step, // the step of det in bytes - int trace_step, // the step of trace in bytes - int c_img_rows, - int c_img_cols, - int c_nOctaveLayers, - int c_octave, - int c_layer_rows, - int c_layer_cols, - int c_max_candidates, - float c_hessianThreshold, - IMAGE_INT32 maskSumTex, - int mask_step -) -{ - volatile __local float N9[768]; // threads.x * threads.y * 3 - - det_step /= sizeof(*det); - trace_step /= sizeof(*trace); - maxCounter += counter_offset; - mask_step /= sizeof(uint); - - // Determine the indices - const int gridDim_y = get_num_groups(1) / c_nOctaveLayers; - const int blockIdx_y = get_group_id(1) % gridDim_y; - const int blockIdx_z = get_group_id(1) / gridDim_y; - - const int layer = blockIdx_z + 1; - - const int size = calcSize(c_octave, layer); - - // Ignore pixels without a 3x3x3 neighbourhood in the layer above - const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1; - - const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1; - const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1; - - // Is this thread within the hessian buffer? - const int zoff = get_local_size(0) * get_local_size(1); - const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; - N9[localLin - zoff] = - det[det_step * - (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin ] = - det[det_step * - (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin + zoff] = - det[det_step * - (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y - + min(max(j, 0), c_img_cols - 1)]; // x - - barrier(CLK_LOCAL_MEM_FENCE); - - if (i < c_layer_rows - margin - && j < c_layer_cols - margin - && get_local_id(0) > 0 - && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 - && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA - ) - { - float val0 = N9[localLin]; - - if (val0 > c_hessianThreshold) - { - // Coordinates for the start of the wavelet in the sum image. There - // is some integer division involved, so don't try to simplify this - // (cancel out sampleStep) without checking the result is the same - const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; - const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - - if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step)) - { - // Check to see if we have a max (in its 26 neighbours) - const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - get_local_size(0) - zoff] - && val0 > N9[localLin + 1 - get_local_size(0) - zoff] - && val0 > N9[localLin - 1 - zoff] - && val0 > N9[localLin - zoff] - && val0 > N9[localLin + 1 - zoff] - && val0 > N9[localLin - 1 + get_local_size(0) - zoff] - && val0 > N9[localLin + get_local_size(0) - zoff] - && val0 > N9[localLin + 1 + get_local_size(0) - zoff] - - && val0 > N9[localLin - 1 - get_local_size(0)] - && val0 > N9[localLin - get_local_size(0)] - && val0 > N9[localLin + 1 - get_local_size(0)] - && val0 > N9[localLin - 1 ] - && val0 > N9[localLin + 1 ] - && val0 > N9[localLin - 1 + get_local_size(0)] - && val0 > N9[localLin + get_local_size(0)] - && val0 > N9[localLin + 1 + get_local_size(0)] - - && val0 > N9[localLin - 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - get_local_size(0) + zoff] - && val0 > N9[localLin + 1 - get_local_size(0) + zoff] - && val0 > N9[localLin - 1 + zoff] - && val0 > N9[localLin + zoff] - && val0 > N9[localLin + 1 + zoff] - && val0 > N9[localLin - 1 + get_local_size(0) + zoff] - && val0 > N9[localLin + get_local_size(0) + zoff] - && val0 > N9[localLin + 1 + get_local_size(0) + zoff] - ; - - if(condmax) - { - int ind = atomic_inc(maxCounter); - - if (ind < c_max_candidates) - { - const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]); - - maxPosBuffer[ind] = (int4)(j, i, layer, laplacian); - } - } - } - } - } -} - __kernel void SURF_findMaximaInLayer( __global float * det, + int det_step, int det_offset, __global float * trace, + int trace_step, int trace_offset, __global int4 * maxPosBuffer, volatile __global int* maxCounter, int counter_offset, - int det_step, // the step of det in bytes - int trace_step, // the step of trace in bytes - int c_img_rows, - int c_img_cols, + int img_rows, + int img_cols, int c_nOctaveLayers, int c_octave, int c_layer_rows, @@ -515,8 +337,8 @@ void SURF_findMaximaInLayer( const int zoff = get_local_size(0) * get_local_size(1); const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; - int l_x = min(max(j, 0), c_img_cols - 1); - int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); + int l_x = min(max(j, 0), img_cols - 1); + int l_y = c_layer_rows * layer + min(max(i, 0), img_rows - 1); N9[localLin - zoff] = det[det_step * (l_y - c_layer_rows) + l_x]; @@ -596,7 +418,7 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x) if (det != 0) { - F invdet = 1.0 / det; + F invdet = 1.0f / det; x[0] = invdet * (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) - @@ -632,13 +454,13 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x) __kernel void SURF_interpolateKeypoint( __global const float * det, + int det_step, int det_offset, __global const int4 * maxPosBuffer, __global float * keypoints, - volatile __global int * featureCounter, - int det_step, - int keypoints_step, - int c_img_rows, - int c_img_cols, + int keypoints_step, int keypoints_offset, + volatile __global int* featureCounter, + int img_rows, + int img_cols, int c_octave, int c_layer_rows, int c_max_features @@ -730,7 +552,7 @@ void SURF_interpolateKeypoint( const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big - if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) + if ((img_rows + 1) >= grad_wav_size && (img_cols + 1) >= grad_wav_size) { // Get a new feature index. int ind = atomic_inc(featureCounter); @@ -836,22 +658,18 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc __kernel void SURF_calcOrientation( - IMAGE_INT32 sumTex, - __global float * keypoints, - int keypoints_step, - int c_img_rows, - int c_img_cols, - int sum_step -) + __PARAM_sumTex__, int img_rows, int img_cols, + __global float * keypoints, int keypoints_step, int keypoints_offset ) { keypoints_step /= sizeof(*keypoints); + #ifndef HAVE_IMAGE2D sum_step /= sizeof(uint); + #endif __global float* featureX = keypoints + X_ROW * keypoints_step; __global float* featureY = keypoints + Y_ROW * keypoints_step; __global float* featureSize = keypoints + SIZE_ROW * keypoints_step; __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; - __local float s_X[ORI_SAMPLES]; __local float s_Y[ORI_SAMPLES]; __local float s_angle[ORI_SAMPLES]; @@ -866,7 +684,6 @@ void SURF_calcOrientation( and building the keypoint descriptor are defined relative to 's' */ const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f; - /* To find the dominant orientation, the gradients in x and y are sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the @@ -874,7 +691,7 @@ void SURF_calcOrientation( const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big - if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) + if ((img_rows + 1) < grad_wav_size || (img_cols + 1) < grad_wav_size) return; // Calc X, Y, angle and store it to shared memory @@ -886,8 +703,8 @@ void SURF_calcOrientation( float ratio = (float)grad_wav_size / 4; - int r2 = round(ratio * 2.0); - int r4 = round(ratio * 4.0); + int r2 = round(ratio * 2.0f); + int r4 = round(ratio * 4.0f); for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) { float X = 0.0f, Y = 0.0f, angle = 0.0f; @@ -895,21 +712,20 @@ void SURF_calcOrientation( const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); - if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && - x >= 0 && x < (c_img_cols + 1) - grad_wav_size) + if (y >= 0 && y < (img_rows + 1) - grad_wav_size && + x >= 0 && x < (img_cols + 1) - grad_wav_size) { - float apt = c_aptW[i]; // Compute the haar sum without fetching duplicate pixels. - float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); - float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); - float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); - float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); - float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); - float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); - float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); - float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); + float t00 = read_sumTex( (int2)(x, y)); + float t02 = read_sumTex( (int2)(x, y + r2)); + float t04 = read_sumTex( (int2)(x, y + r4)); + float t20 = read_sumTex( (int2)(x + r2, y)); + float t24 = read_sumTex( (int2)(x + r2, y + r4)); + float t40 = read_sumTex( (int2)(x + r4, y)); + float t42 = read_sumTex( (int2)(x + r4, y + r2)); + float t44 = read_sumTex( (int2)(x + r4, y + r4)); F t = t00 - t04 - t20 + t24; X -= t / ((r2) * (r4)); @@ -1001,7 +817,7 @@ void SURF_calcOrientation( } __kernel -void SURF_setUpright( +void SURF_setUpRight( __global float * keypoints, int keypoints_step, int keypoints_offset, int rows, int cols ) @@ -1050,22 +866,14 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] = }; // utility for linear filter -inline uchar readerGet( - IMAGE_INT8 src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, - int i, int j, int rows, int cols, int elemPerRow -) -{ - float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; - float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; - return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow); -} +#define readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, i, j) \ + read_imgTex((float2)(centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir, \ + centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir)) inline float linearFilter( - IMAGE_INT8 src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, - float y, float x, int rows, int cols, int elemPerRow -) + __PARAM_imgTex__, int img_rows, int img_cols, + float centerX, float centerY, float win_offset, + float cos_dir, float sin_dir, float y, float x ) { x -= 0.5f; y -= 0.5f; @@ -1077,34 +885,31 @@ inline float linearFilter( const int x2 = x1 + 1; const int y2 = y1 + 1; - uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow); + uchar src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1); out = out + src_reg * ((x2 - x) * (y2 - y)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow); + src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2); out = out + src_reg * ((x - x1) * (y2 - y)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow); + src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1); out = out + src_reg * ((x2 - x) * (y - y1)); - src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow); + src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2); out = out + src_reg * ((x - x1) * (y - y1)); return out; } void calc_dx_dy( - IMAGE_INT8 imgTex, + __PARAM_imgTex__, + int img_rows, int img_cols, volatile __local float *s_dx_bin, volatile __local float *s_dy_bin, volatile __local float *s_PATCH, __global const float* featureX, __global const float* featureY, __global const float* featureSize, - __global const float* featureDir, - int rows, - int cols, - int elemPerRow -) + __global const float* featureDir ) { const float centerX = featureX[get_group_id(0)]; const float centerY = featureY[get_group_id(0)]; @@ -1141,7 +946,9 @@ void calc_dx_dy( const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; - s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); + s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = + linearFilter(__PASS_imgTex__, img_rows, img_cols, centerX, centerY, + win_offset, cos_dir, sin_dir, icoo, jcoo); barrier(CLK_LOCAL_MEM_FENCE); @@ -1232,9 +1039,8 @@ void reduce_sum25( __kernel void SURF_computeDescriptors64( - IMAGE_INT8 imgTex, - int img_step, int img_offset, - int rows, int cols, + __PARAM_imgTex__, + int img_rows, int img_cols, __global const float* keypoints, int keypoints_step, int keypoints_offset, __global float * descriptors, @@ -1254,7 +1060,7 @@ void SURF_computeDescriptors64( volatile __local float sdyabs[25]; volatile __local float s_PATCH[6*6]; - calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); + calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); barrier(CLK_LOCAL_MEM_FENCE); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); @@ -1286,9 +1092,8 @@ void SURF_computeDescriptors64( __kernel void SURF_computeDescriptors128( - IMAGE_INT8 imgTex, - int img_step, int img_offset, - int rows, int cols, + __PARAM_imgTex__, + int img_rows, int img_cols, __global const float* keypoints, int keypoints_step, int keypoints_offset, __global float* descriptors, @@ -1313,7 +1118,7 @@ void SURF_computeDescriptors128( volatile __local float sdabs2[25]; volatile __local float s_PATCH[6*6]; - calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); + calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir); barrier(CLK_LOCAL_MEM_FENCE); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); @@ -1486,7 +1291,7 @@ void reduce_sum64(volatile __local float* smem, int tid) } __kernel -void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step) +void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step, int descriptors_offset) { descriptors_step /= sizeof(*descriptors); // no need for thread ID @@ -1514,7 +1319,7 @@ void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_ } __kernel -void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step) +void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step, int descriptors_offset) { descriptors_step /= sizeof(*descriptors); // no need for thread ID diff --git a/modules/nonfree/src/surf.cpp b/modules/nonfree/src/surf.cpp index 9182916c9..cd4e5e688 100644 --- a/modules/nonfree/src/surf.cpp +++ b/modules/nonfree/src/surf.cpp @@ -902,7 +902,7 @@ void SURF::operator()(InputArray _img, InputArray _mask, bool doDescriptors = _descriptors.needed(); CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4)); - CV_Assert(_descriptors.needed() && !useProvidedKeypoints); + CV_Assert(_descriptors.needed() || !useProvidedKeypoints); if( ocl::useOpenCL() ) { diff --git a/modules/nonfree/src/surf.hpp b/modules/nonfree/src/surf.hpp index b58921033..7c43f1efe 100644 --- a/modules/nonfree/src/surf.hpp +++ b/modules/nonfree/src/surf.hpp @@ -54,14 +54,11 @@ protected: bool setImage(InputArray img, InputArray mask); // kernel callers declarations - bool calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int layer_rows); + bool calcLayerDetAndTrace(int octave, int layer_rows); - bool findMaximaInLayer(const UMat &det, const UMat &trace, UMat &maxPosBuffer, - UMat &maxCounter, int counterOffset, - int octave, int layer_rows, int layer_cols); + bool findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols); - bool interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, - UMat &keypoints, UMat &counters, int octave, int layer_rows, int maxFeatures); + bool interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int maxFeatures); bool calcOrientation(UMat &keypoints); @@ -75,7 +72,7 @@ protected: int refcount; //! max keypoints = min(keypointsRatio * img.size().area(), 65535) - UMat sum, mask1, maskSum, intBuffer; + UMat sum, intBuffer; UMat det, trace; UMat maxPosBuffer; @@ -87,12 +84,11 @@ protected: UMat img, counters; // texture buffers - ocl::Image2D imgTex, sumTex, maskSumTex; + ocl::Image2D imgTex, sumTex; bool haveImageSupport; + String kerOpts; int status; - ocl::Kernel kerCalcDetTrace, kerFindMaxima, kerFindMaximaMask, kerInterp; - ocl::Kernel kerUpRight, kerOri, kerCalcDesc64, kerCalcDesc128, kerNormDesc64, kerNormDesc128; }; /* diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 70b4be56e..bf5db6ccb 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -54,20 +54,6 @@ namespace cv enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) }; -/*static void openCLExecuteKernelSURF(Context2 *clCxt, const ProgramEntry* source, String kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) -{ - std::stringstream optsStr; - optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; - optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; - cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); - size_t wave_size = queryWaveFrontSize(kernel); - CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); - optsStr << "-D WAVE_SIZE=" << wave_size; - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); -}*/ - static inline int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ @@ -100,22 +86,11 @@ bool SURF_OCL::init(const SURF* p) if(ocl::haveOpenCL()) { const ocl::Device& dev = ocl::Device::getDefault(); - if( dev.type() == ocl::Device::TYPE_CPU ) + if( dev.type() == ocl::Device::TYPE_CPU || dev.doubleFPConfig() == 0 ) return false; - haveImageSupport = dev.imageSupport(); - String opts = haveImageSupport ? "-D DISABLE_IMAGE2D" : ""; - - if( kerCalcDetTrace.create("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, opts) && - kerFindMaxima.create("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, opts) && - kerFindMaximaMask.create("SURF_findMaximaInLayerWithMask", ocl::nonfree::surf_oclsrc, opts) && - kerInterp.create("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, opts) && - kerUpRight.create("SURF_setUpRight", ocl::nonfree::surf_oclsrc, opts) && - kerOri.create("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, opts) && - kerCalcDesc64.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, opts) && - kerCalcDesc128.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, opts) && - kerNormDesc64.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, opts) && - kerNormDesc128.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, opts)) - status = 1; + haveImageSupport = false;//dev.imageSupport(); + kerOpts = haveImageSupport ? "-D HAVE_IMAGE2D -D DOUBLE_SUPPORT" : ""; + status = 1; } } return status > 0; @@ -126,8 +101,10 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask) { if( status <= 0 ) return false; - CV_Assert(!_img.empty() && _img.type() == CV_8UC1); - CV_Assert(_mask.empty() || (_mask.size() == _img.size() && _mask.type() == CV_8UC1)); + if( !_mask.empty()) + return false; + int imgtype = _img.type(); + CV_Assert(!_img.empty()); CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0); int min_size = calcSize(params->nOctaves - 1, 0); @@ -151,10 +128,12 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask) counters.setTo(Scalar::all(0)); img.release(); - if(_img.isUMat()) + if(_img.isUMat() && imgtype == CV_8UC1) img = _img.getUMat(); - else + else if( imgtype == CV_8UC1 ) _img.copyTo(img); + else + cvtColor(_img, img, COLOR_BGR2GRAY); integral(img, sum); @@ -164,12 +143,6 @@ bool SURF_OCL::setImage(InputArray _img, InputArray _mask) sumTex = ocl::Image2D(sum); } - maskSumTex = ocl::Image2D(); - - if(!_mask.empty()) - { - CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet"); - } return true; } @@ -191,11 +164,10 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints) const int layer_rows = img_rows >> octave; const int layer_cols = img_cols >> octave; - if(!calcLayerDetAndTrace(det, trace, octave, layer_rows)) + if(!calcLayerDetAndTrace(octave, layer_rows)) return false; - if(!findMaximaInLayer(det, trace, maxPosBuffer, counters, 1 + octave, octave, - layer_rows, layer_cols)) + if(!findMaximaInLayer(1 + octave, octave, layer_rows, layer_cols)) return false; cpuCounters = counters.getMat(ACCESS_READ); @@ -205,8 +177,7 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints) if (maxCounter > 0) { - if(!interpolateKeypoint(det, maxPosBuffer, maxCounter, keypoints, - counters, octave, layer_rows, maxFeatures)) + if(!interpolateKeypoint(maxCounter, keypoints, octave, layer_rows, maxFeatures)) return false; } } @@ -216,7 +187,7 @@ bool SURF_OCL::detectKeypoints(UMat &keypoints) featureCounter = std::min(featureCounter, maxFeatures); cpuCounters.release(); - keypoints = UMat(keypoints, Rect(0, 0, featureCounter, 1)); + keypoints = UMat(keypoints, Rect(0, 0, featureCounter, keypoints.rows)); if (params->upright) return setUpRight(keypoints); @@ -232,7 +203,8 @@ bool SURF_OCL::setUpRight(UMat &keypoints) return true; size_t globalThreads[3] = {nFeatures, 1}; - return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, false); + ocl::Kernel kerUpRight("SURF_setUpRight", ocl::nonfree::surf_oclsrc, kerOpts); + return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, true); } bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors) @@ -255,14 +227,14 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor if( descriptorSize == 64 ) { - kerCalcDesc = kerCalcDesc64; - kerNormDesc = kerNormDesc64; + kerCalcDesc.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts); + kerNormDesc.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts); } else { CV_Assert(descriptorSize == 128); - kerCalcDesc = kerCalcDesc128; - kerNormDesc = kerNormDesc128; + kerCalcDesc.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts); + kerNormDesc.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts); } size_t localThreads[] = {6, 6}; @@ -271,17 +243,19 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor if(haveImageSupport) { kerCalcDesc.args(imgTex, + img_rows, img_cols, ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::WriteOnlyNoSize(descriptors)); } else { - kerCalcDesc.args(ocl::KernelArg::ReadOnly(img), + kerCalcDesc.args(ocl::KernelArg::ReadOnlyNoSize(img), + img_rows, img_cols, ocl::KernelArg::ReadOnlyNoSize(keypoints), ocl::KernelArg::WriteOnlyNoSize(descriptors)); } - if(!kerCalcDesc.run(2, globalThreads, localThreads, false)) + if(!kerCalcDesc.run(2, globalThreads, localThreads, true)) return false; size_t localThreads_n[] = {descriptorSize, 1}; @@ -290,7 +264,7 @@ bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptor globalThreads[0] = nFeatures * localThreads[0]; globalThreads[1] = localThreads[1]; bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)). - run(2, globalThreads_n, localThreads_n, false); + run(2, globalThreads_n, localThreads_n, true); if(ok && !_descriptors.isUMat()) descriptors.copyTo(_descriptors); return ok; @@ -364,19 +338,19 @@ void SURF_OCL::downloadKeypoints(const UMat &keypointsGPU, std::vector } } -bool SURF_OCL::detect(InputArray img, InputArray mask, UMat& keypoints) +bool SURF_OCL::detect(InputArray _img, InputArray _mask, UMat& keypoints) { - if( !setImage(img, mask) ) + if( !setImage(_img, _mask) ) return false; return detectKeypoints(keypoints); } -bool SURF_OCL::detectAndCompute(InputArray img, InputArray mask, UMat& keypoints, +bool SURF_OCL::detectAndCompute(InputArray _img, InputArray _mask, UMat& keypoints, OutputArray _descriptors, bool useProvidedKeypoints ) { - if( !setImage(img, mask) ) + if( !setImage(_img, _mask) ) return false; if( !useProvidedKeypoints && !detectKeypoints(keypoints) ) @@ -389,22 +363,20 @@ inline int divUp(int a, int b) { return (a + b-1)/b; } //////////////////////////// // kernel caller definitions -bool SURF_OCL::calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int c_layer_rows) +bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows) { int nOctaveLayers = params->nOctaveLayers; const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_j = 1 + ((img_cols - min_size) >> octave); - String kernelName = "SURF_calcLayerDetAndTrace"; - std::vector< std::pair > args; - - size_t localThreads[3] = {16, 16}; - size_t globalThreads[3] = + size_t localThreads[] = {16, 16}; + size_t globalThreads[] = { divUp(max_samples_j, localThreads[0]) *localThreads[0], divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2) }; + ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, kerOpts); if(haveImageSupport) { kerCalcDetTrace.args(sumTex, @@ -421,56 +393,15 @@ bool SURF_OCL::calcLayerDetAndTrace(UMat &det, UMat &trace, int octave, int c_la ocl::KernelArg::WriteOnlyNoSize(det), ocl::KernelArg::WriteOnlyNoSize(trace)); } - return kerCalcDetTrace.run(2, globalThreads, localThreads, false); + return kerCalcDetTrace.run(2, globalThreads, localThreads, true); } -bool SURF_OCL::findMaximaInLayer(const UMat &det, const UMat &trace, - UMat &maxPosBuffer, UMat &maxCounter, - int counterOffset, int octave, +bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols) { const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; - bool haveMask = !maskSum.empty() || (maskSumTex.ptr() != 0); int nOctaveLayers = params->nOctaveLayers; - ocl::Kernel ker; - if( haveMask ) - { - if( haveImageSupport ) - ker = kerFindMaximaMask.args(maskSumTex, - ocl::KernelArg::ReadOnlyNoSize(det), - ocl::KernelArg::ReadOnlyNoSize(trace), - ocl::KernelArg::PtrReadWrite(maxPosBuffer), - ocl::KernelArg::PtrReadWrite(maxCounter), - counterOffset, img_rows, img_cols, - octave, nOctaveLayers, - layer_rows, layer_cols, - maxCandidates, - (float)params->hessianThreshold); - else - ker = kerFindMaximaMask.args(ocl::KernelArg::ReadOnlyNoSize(maskSum), - ocl::KernelArg::ReadOnlyNoSize(det), - ocl::KernelArg::ReadOnlyNoSize(trace), - ocl::KernelArg::PtrReadWrite(maxPosBuffer), - ocl::KernelArg::PtrReadWrite(maxCounter), - counterOffset, img_rows, img_cols, - octave, nOctaveLayers, - layer_rows, layer_cols, - maxCandidates, - (float)params->hessianThreshold); - } - else - { - ker = kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det), - ocl::KernelArg::ReadOnlyNoSize(trace), - ocl::KernelArg::PtrReadWrite(maxPosBuffer), - ocl::KernelArg::PtrReadWrite(maxCounter), - counterOffset, img_rows, img_cols, - octave, nOctaveLayers, - layer_rows, layer_cols, - maxCandidates, - (float)params->hessianThreshold); - } size_t localThreads[3] = {16, 16}; size_t globalThreads[3] = { @@ -478,21 +409,31 @@ bool SURF_OCL::findMaximaInLayer(const UMat &det, const UMat &trace, divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nOctaveLayers *localThreads[1] }; - return ker.run(2, globalThreads, localThreads, false); + ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, kerOpts); + return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det), + ocl::KernelArg::ReadOnlyNoSize(trace), + ocl::KernelArg::PtrReadWrite(maxPosBuffer), + ocl::KernelArg::PtrReadWrite(counters), + counterOffset, img_rows, img_cols, + octave, nOctaveLayers, + layer_rows, layer_cols, + maxCandidates, + (float)params->hessianThreshold).run(2, globalThreads, localThreads, true); } -bool SURF_OCL::interpolateKeypoint(const UMat &det, const UMat &maxPosBuffer, int maxCounter, - UMat &keypoints, UMat &counters_, int octave, int layer_rows, int max_features) +bool SURF_OCL::interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int max_features) { size_t localThreads[3] = {3, 3, 3}; size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3}; + ocl::Kernel kerInterp("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, kerOpts); + return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det), ocl::KernelArg::PtrReadOnly(maxPosBuffer), ocl::KernelArg::ReadWriteNoSize(keypoints), - ocl::KernelArg::PtrReadWrite(counters_), + ocl::KernelArg::PtrReadWrite(counters), img_rows, img_cols, octave, layer_rows, max_features). - run(3, globalThreads, localThreads, false); + run(3, globalThreads, localThreads, true); } bool SURF_OCL::calcOrientation(UMat &keypoints) @@ -500,18 +441,19 @@ bool SURF_OCL::calcOrientation(UMat &keypoints) int nFeatures = keypoints.cols; if( nFeatures == 0 ) return true; + ocl::Kernel kerOri("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, kerOpts); + if( haveImageSupport ) - kerOri.args(sumTex, - ocl::KernelArg::ReadWriteNoSize(keypoints), - img_rows, img_cols); + kerOri.args(sumTex, img_rows, img_cols, + ocl::KernelArg::ReadWriteNoSize(keypoints)); else kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum), - ocl::KernelArg::ReadWriteNoSize(keypoints), - img_rows, img_cols); + img_rows, img_cols, + ocl::KernelArg::ReadWriteNoSize(keypoints)); size_t localThreads[3] = {ORI_LOCAL_SIZE, 1}; size_t globalThreads[3] = {nFeatures * localThreads[0], 1}; - return kerOri.run(2, globalThreads, localThreads, false); + return kerOri.run(2, globalThreads, localThreads, true); } }