Merge pull request #707 from pengx17:2.4_surf

2013-03-29 18:20:45 +04:00
parent b5dd26e4c7 f2ecf4f905
commit 56d62118d5
4 changed files with 261 additions and 59 deletions
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduc
 #define op(A, B) (*A)+(B)
    data[tid] = *partial_reduction;
    barrier(CLK_LOCAL_MEM_FENCE);
-
+#ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
    if (tid < 16)
    {
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
 #if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
    {
 #endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
 #if WAVE_SIZE < 8
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
    {
 #endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
 #if WAVE_SIZE < 4
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
    {
 #endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
 #if WAVE_SIZE < 2
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
    {
 #endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
    }
 #undef WAVE_SIZE
 #undef op
 }
@@ -1087,44 +1108,67 @@ void reduce_sum25(
    int tid
    )
 {
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
    // first step is to reduce from 25 to 16
-    if (tid < 9) // use 9 threads
+    if (tid < 9)
    {
        sdata1[tid] += sdata1[tid + 16];
        sdata2[tid] += sdata2[tid + 16];
        sdata3[tid] += sdata3[tid + 16];
        sdata4[tid] += sdata4[tid + 16];
 #if WAVE_SIZE < 16
    }
-
+    barrier(CLK_LOCAL_MEM_FENCE);
    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
    if (tid < 8)
    {
 #endif
        sdata1[tid] += sdata1[tid + 8];
        sdata1[tid] += sdata1[tid + 4];
        sdata1[tid] += sdata1[tid + 2];
        sdata1[tid] += sdata1[tid + 1];
        sdata2[tid] += sdata2[tid + 8];
        sdata2[tid] += sdata2[tid + 4];
        sdata2[tid] += sdata2[tid + 2];
        sdata2[tid] += sdata2[tid + 1];
        sdata3[tid] += sdata3[tid + 8];
        sdata3[tid] += sdata3[tid + 4];
        sdata3[tid] += sdata3[tid + 2];
        sdata3[tid] += sdata3[tid + 1];
        sdata4[tid] += sdata4[tid + 8];
 #if WAVE_SIZE < 8
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
    {
 #endif
        sdata1[tid] += sdata1[tid + 4];
        sdata2[tid] += sdata2[tid + 4];
        sdata3[tid] += sdata3[tid + 4];
        sdata4[tid] += sdata4[tid + 4];
 #if WAVE_SIZE < 4
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
    {
 #endif
        sdata1[tid] += sdata1[tid + 2];
        sdata2[tid] += sdata2[tid + 2];
        sdata3[tid] += sdata3[tid + 2];
        sdata4[tid] += sdata4[tid + 2];
 #if WAVE_SIZE < 2
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
    {
 #endif
        sdata1[tid] += sdata1[tid + 1];
        sdata2[tid] += sdata2[tid + 1];
        sdata3[tid] += sdata3[tid + 1];
        sdata4[tid] += sdata4[tid + 1];
    }
 #undef WAVE_SIZE
 }
 __kernel
    void compute_descriptors64(
    IMAGE_INT8 imgTex,
-    volatile __global float * descriptors,
+    __global float * descriptors,
    __global const float * keypoints,
    int descriptors_step,
    int keypoints_step,
@@ -1158,14 +1202,13 @@ __kernel
        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 25)
+
    {
        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-    }
+ 
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 25)
    {
-        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+        __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
        // write dx, dy, |dx|, |dy|
        if (tid == 0)
@@ -1180,7 +1223,7 @@ __kernel
 __kernel
    void compute_descriptors128(
    IMAGE_INT8 imgTex,
-    __global volatile float * descriptors,
+    __global float * descriptors,
    __global float * keypoints,
    int descriptors_step,
    int keypoints_step,
@@ -1229,13 +1272,15 @@ __kernel
            sd2[tid] = sdx[tid];
            sdabs2[tid] = fabs(sdx[tid]);
        }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
    __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
    if (tid < 25)
    {
        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
        if (tid == 0)
        {
@@ -1259,11 +1304,14 @@ __kernel
            sd2[tid] = sdy[tid];
            sdabs2[tid] = fabs(sdy[tid]);
        }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 25)
    {
        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (tid == 0)
        {
@@ -1274,6 +1322,103 @@ __kernel
        }
    }
 }
 void reduce_sum128(volatile __local  float* smem, int tid)
 {
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
    if (tid < 64)
    {
        smem[tid] += smem[tid + 64];
 #if WAVE_SIZE < 64
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 32) 
    {
 #endif
        smem[tid] += smem[tid + 32];
 #if WAVE_SIZE < 32
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16) 
    {
 #endif
        smem[tid] += smem[tid + 16];
 #if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
    {
 #endif
        smem[tid] += smem[tid + 8];
 #if WAVE_SIZE < 8
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
    {
 #endif
        smem[tid] += smem[tid + 4];
 #if WAVE_SIZE < 4
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
    {
 #endif
        smem[tid] += smem[tid + 2];
 #if WAVE_SIZE < 2
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
    {
 #endif
        smem[tid] += smem[tid + 1];
    }
 }
 void reduce_sum64(volatile __local  float* smem, int tid)
 {
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
    if (tid < 32)
    {
        smem[tid] += smem[tid + 32];
 #if WAVE_SIZE < 32
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16) 
    {
 #endif
        smem[tid] += smem[tid + 16];
 #if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
    {
 #endif
        smem[tid] += smem[tid + 8];
 #if WAVE_SIZE < 8
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
    {
 #endif
        smem[tid] += smem[tid + 4];
 #if WAVE_SIZE < 4
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
    {
 #endif
        smem[tid] += smem[tid + 2];
 #if WAVE_SIZE < 2
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
    {
 #endif
        smem[tid] += smem[tid + 1];
    }
 }
 __kernel
    void normalize_descriptors128(__global float * descriptors, int descriptors_step)
@@ -1288,22 +1433,10 @@ __kernel
    sqDesc[get_local_id(0)] = lookup * lookup;
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < 64)
+    reduce_sum128(sqDesc, get_local_id(0));
        sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduction to get total
    if (get_local_id(0) < 32)
    {
        volatile __local  float* smem = sqDesc;
        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
    }
    // compute length (square root)
    volatile __local  float len;
@@ -1329,18 +1462,9 @@ __kernel
    sqDesc[get_local_id(0)] = lookup * lookup;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduction to get total
    if (get_local_id(0) < 32)
    {
        volatile __local  float* smem = sqDesc;
-        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
+    reduce_sum64(sqDesc, get_local_id(0));
-        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
    }
    // compute length (square root)
    volatile __local  float len;
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -43,6 +43,7 @@
 //
 //M*/
 #include "precomp.hpp"
 #include <cstdio>
 #ifdef HAVE_OPENCV_OCL
@@ -57,25 +58,35 @@ namespace cv
        ///////////////////////////OpenCL kernel strings///////////////////////////
        extern const char *surf;
-        const char* noImage2dOption = "-D DISABLE_IMAGE2D";
+        const char noImage2dOption [] = "-D DISABLE_IMAGE2D";
        static char SURF_OPTIONS [1024] = ""; 
        static bool USE_IMAGE2d = false;
        static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth)
        {
-            if(support_image2d())
+            char * pSURF_OPTIONS = SURF_OPTIONS;
            static bool OPTION_INIT = false;
            if(!OPTION_INIT)
            {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
+                if( !USE_IMAGE2d )
            }
            else
                {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
+                    strcat(pSURF_OPTIONS, noImage2dOption);
                    pSURF_OPTIONS += strlen(noImage2dOption);
                }
                size_t wave_size = 0;
                queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
                std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size));
                OPTION_INIT = true;
            }
            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
        }
    }
 }
-static inline int divUp(size_t total, size_t grain)
+static inline size_t divUp(size_t total, size_t grain)
 {
    return (total + grain - 1) / grain;
 }
@@ -151,9 +162,21 @@ public:
        integral(img, surf_.sum);
        if(support_image2d())
        {
            try
            {
                bindImgTex(img, imgTex);
                bindImgTex(surf_.sum, sumTex);
                USE_IMAGE2d = true;
            }
            catch (const cv::Exception& e)
            {
                USE_IMAGE2d = false;
                if(e.code != CL_IMAGE_FORMAT_NOT_SUPPORTED && e.code != -217)
                {
                    throw e;
                }
            }
        }
        maskSumTex = 0;
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -123,6 +123,16 @@ namespace cv
        // returns whether the current context supports image2d_t format or not
        bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
        // the enums are used to query device information
        // currently only support wavefront size queries
        enum DEVICE_INFO
        {
             WAVEFRONT_SIZE,            //in AMD speak
             WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak
        };
        //info should have been pre-allocated
        void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
    }//namespace ocl
 }//namespace cv
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -353,6 +353,51 @@ namespace cv
        {
            return &(Context::getContext()->impl->clCmdQueue);
        }
        void queryDeviceInfo(DEVICE_INFO info_type, void* info)
        {
            static Info::Impl* impl = Context::getContext()->impl;
            switch(info_type)
            {
            case WAVEFRONT_SIZE:
                {
 #ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
                    try
                    {
                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
                            CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(size_t), info, 0));
                    }
                    catch(const cv::Exception&)
 #elif defined (CL_DEVICE_WARP_SIZE_NV)
                    const int EXT_LEN = 4096 + 1 ;
                    char extends_set[EXT_LEN];
                    size_t extends_size;
                    openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
                    extends_set[EXT_LEN - 1] = 0;
                    if(std::string(extends_set).find("cl_nv_device_attribute_query") != std::string::npos)
                    {
                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
                            CL_DEVICE_WARP_SIZE_NV, sizeof(size_t), info, 0));
                    }
                    else
 #endif
                    {
                        // if no way left for us to query the warp size, we can get it from kernel group info
                        static const char * _kernel_string = "__kernel void test_func() {}";
                        cl_kernel kernel;
                        kernel = openCLGetKernelFromSource(Context::getContext(), &_kernel_string, "test_func");
                        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
                            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), info, NULL));
                    }
                }
                break;
            default:
                CV_Error(-1, "Invalid device info type");
                break;
            }
        }
        void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
        {
            cl_int status;