From 113b7584e062388be1b9b8628d1e36b3208ebf2b Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Fri, 12 Apr 2013 16:50:30 +0800
Subject: [PATCH] Optimize bfmatcher by passing macros.

---
 modules/ocl/src/brute_force_matcher.cpp     | 39 +++++----
 modules/ocl/src/opencl/brute_force_match.cl | 91 +++++++++++++++------
 2 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp
index 5656e926b..ee0989d64 100644
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -74,6 +74,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
     int m_size = MAX_DESC_LEN;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -82,8 +85,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
         args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -93,7 +94,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
 
         std::string kernelName = "BruteForceMatch_UnrollMatch";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
@@ -115,6 +116,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
     int block_size = BLOCK_SIZE;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d", block_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -123,7 +127,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
         args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -133,7 +136,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
 
         std::string kernelName = "BruteForceMatch_Match";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
@@ -157,6 +160,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
     int m_size = MAX_DESC_LEN;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -167,8 +173,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -180,7 +184,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
 
         std::string kernelName = "BruteForceMatch_RadiusUnrollMatch";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
@@ -197,6 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
     int block_size = BLOCK_SIZE;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d", block_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -207,7 +214,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -219,7 +225,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
 
         std::string kernelName = "BruteForceMatch_RadiusMatch";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
@@ -294,6 +300,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
     int m_size = MAX_DESC_LEN;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -302,8 +311,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
         args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -313,7 +320,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
 
         std::string kernelName = "BruteForceMatch_knnUnrollMatch";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
@@ -328,6 +335,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
     int block_size = BLOCK_SIZE;
     vector< pair<size_t, const void *> > args;
 
+    static const int OPT_SIZE = 40;
+    char opt [OPT_SIZE] = "";
+    sprintf(opt, "-D block_size=%d", block_size);
     if(globalSize[0] != 0)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
@@ -336,7 +346,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
         args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
         args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
         args.push_back( make_pair( smemSize, (void *)NULL));
-        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
@@ -346,7 +355,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
 
         std::string kernelName = "BruteForceMatch_knnMatch";
 
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
     }
 }
 
diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl
index e76fb1d21..7821920dc 100644
--- a/modules/ocl/src/opencl/brute_force_match.cl
+++ b/modules/ocl/src/opencl/brute_force_match.cl
@@ -1,5 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#define MAX_FLOAT 1e7f
+#define MAX_FLOAT 3.40282e+038f
+
+#ifndef block_size
+#define block_size 16
+#endif
+#ifndef max_desc_len
+#define max_desc_len 64
+#endif
 
 int bit1Count(float x)
 {
@@ -15,7 +68,6 @@ int bit1Count(float x)
 
 float reduce_block(__local float *s_query,
                    __local float *s_train,
-                   int block_size,
                    int lidx,
                    int lidy,
                    int distType
@@ -51,8 +103,6 @@ float reduce_block(__local float *s_query,
 
 float reduce_multi_block(__local float *s_query,
                          __local float *s_train,
-                         int max_desc_len,
-                         int block_size,
                          int block_index,
                          int lidx,
                          int lidy,
@@ -98,8 +148,6 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     __global int *bestTrainIdx,
     __global float *bestDistance,
     __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -108,6 +156,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     int distType
 )
 {
+
     const int lidx = get_local_id(0);
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);
@@ -117,6 +166,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
 
     int queryIdx = groupidx * block_size + lidy;
     // load the query into local memory.
+    #pragma unroll
     for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
     {
         int loadx = lidx + i * block_size;
@@ -128,9 +178,10 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
 
     // loopUnrolledCached to find the best trainIdx and best distance.
     volatile int imgIdx = 0;
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++)
     {
         float result = 0;
+        #pragma unroll
         for (int i = 0 ; i < max_desc_len / block_size ; i++)
         {
             //load a block_size * block_size block into local train.
@@ -140,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);
 
-            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
+            result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -168,6 +219,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     barrier(CLK_LOCAL_MEM_FENCE);
 
     //reduce -- now all reduce implement in each threads.
+    #pragma unroll
     for (int k = 0 ; k < block_size; k++)
     {
         if (myBestDistance > s_distance[k])
@@ -191,7 +243,6 @@ __kernel void BruteForceMatch_Match_D5(
     __global int *bestTrainIdx,
     __global float *bestDistance,
     __local float *sharebuffer,
-    int block_size,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -232,7 +283,7 @@ __kernel void BruteForceMatch_Match_D5(
 
             barrier(CLK_LOCAL_MEM_FENCE);
 
-            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
+            result += reduce_block(s_query, s_train, lidx, lidy, distType);
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -287,8 +338,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
     __global float *bestDistance,
     __global int *nMatches,
     __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -322,7 +371,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
         //synchronize to make sure each elem for reduceIteration in share memory is written already.
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
+        result += reduce_block(s_query, s_train, lidx, lidy, distType);
 
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -350,7 +399,6 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
     __global float *bestDistance,
     __global int *nMatches,
     __local float *sharebuffer,
-    int block_size,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -384,7 +432,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
         //synchronize to make sure each elem for reduceIteration in share memory is written already.
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
+        result += reduce_block(s_query, s_train, lidx, lidy, distType);
 
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -410,8 +458,6 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     __global int2 *bestTrainIdx,
     __global float2 *bestDistance,
     __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -455,7 +501,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);
 
-            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
+            result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -559,7 +605,6 @@ __kernel void BruteForceMatch_knnMatch_D5(
     __global int2 *bestTrainIdx,
     __global float2 *bestDistance,
     __local float *sharebuffer,
-    int block_size,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -600,7 +645,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
 
             barrier(CLK_LOCAL_MEM_FENCE);
 
-            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
+            result += reduce_block(s_query, s_train, lidx, lidy, distType);
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
@@ -703,8 +748,6 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
     //__global float *mask,
     __global float *allDist,
     __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -721,7 +764,6 @@ kernel void BruteForceMatch_calcDistance_D5(
     //__global float *mask,
     __global float *allDist,
     __local float *sharebuffer,
-    int block_size,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -736,8 +778,7 @@ kernel void BruteForceMatch_findBestMatch_D5(
     __global float *allDist,
     __global int *bestTrainIdx,
     __global float *bestDistance,
-    int k,
-    int block_size
+    int k
 )
 {
     /* Todo */