From d8e3971e7f3608b91d4f5e6467a8391b6a12edf6 Mon Sep 17 00:00:00 2001
From: niederb <thomas@niederberger.ch>
Date: Sun, 29 Nov 2015 23:25:46 +0100
Subject: [PATCH 1/5] Fixed variable importance in rtrees

---
 modules/ml/src/rtrees.cpp   | 25 +++++++++++++------------
 samples/cpp/tree_engine.cpp | 13 +++++++++++--
 2 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index 4da34992d..f1f122ebf 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -187,7 +187,7 @@ public:
                 oobidx.clear();
                 for( i = 0; i < n; i++ )
                 {
-                    if( !oobmask[i] )
+                    if( oobmask[i] )
                         oobidx.push_back(i);
                 }
                 int n_oob = (int)oobidx.size();
@@ -217,6 +217,7 @@ public:
                     else
                     {
                         int ival = cvRound(val);
+                        //Voting scheme to combine OOB errors of each tree
                         int* votes = &oobvotes[j*nclasses];
                         votes[ival]++;
                         int best_class = 0;
@@ -235,35 +236,35 @@ public:
                     oobperm.resize(n_oob);
                     for( i = 0; i < n_oob; i++ )
                         oobperm[i] = oobidx[i];
+                    for (i = n_oob - 1; i > 0; --i)  //Randomly shuffle indices so we can permute features
+                    {
+                        int r_i = rng.uniform(0, i + 1);
+                        std::swap(oobperm[i], oobperm[r_i]);
+                    }
 
                     for( vi_ = 0; vi_ < nvars; vi_++ )
                     {
-                        vi = vidx ? vidx[vi_] : vi_;
+                        vi = vidx ? vidx[vi_] : vi_; //Ensure that only the user specified predictors are used for training
                         double ncorrect_responses_permuted = 0;
-                        for( i = 0; i < n_oob; i++ )
-                        {
-                            int i1 = rng.uniform(0, n_oob);
-                            int i2 = rng.uniform(0, n_oob);
-                            std::swap(i1, i2);
-                        }
 
                         for( i = 0; i < n_oob; i++ )
                         {
                             j = oobidx[i];
                             int vj = oobperm[i];
                             sample0 = Mat( nallvars, 1, CV_32F, psamples + sstep0*w->sidx[j], sstep1*sizeof(psamples[0]) );
-                            for( k = 0; k < nallvars; k++ )
-                                sample.at<float>(k) = sample0.at<float>(k);
-                            sample.at<float>(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi];
+                            Mat sample_clone = sample0.clone(); //create a copy so we don't mess up the original data
+                            sample_clone.at<float>(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi];
 
-                            double val = predictTrees(Range(treeidx, treeidx+1), sample, predictFlags);
+                            double val = predictTrees(Range(treeidx, treeidx+1), sample_clone, predictFlags);
                             if( !_isClassifier )
                             {
                                 val = (val - w->ord_responses[w->sidx[j]])/max_response;
                                 ncorrect_responses_permuted += exp( -val*val );
                             }
                             else
+                            {
                                 ncorrect_responses_permuted += cvRound(val) == w->cat_responses[w->sidx[j]];
+                            }
                         }
                         varImportance[vi] += (float)(ncorrect_responses - ncorrect_responses_permuted);
                     }
diff --git a/samples/cpp/tree_engine.cpp b/samples/cpp/tree_engine.cpp
index 2d6824d24..d9fbb9678 100644
--- a/samples/cpp/tree_engine.cpp
+++ b/samples/cpp/tree_engine.cpp
@@ -63,7 +63,6 @@ int main(int argc, char** argv)
     const double train_test_split_ratio = 0.5;
 
     Ptr<TrainData> data = TrainData::loadFromCSV(filename, 0, response_idx, response_idx+1, typespec);
-
     if( data.empty() )
     {
         printf("ERROR: File %s can not be read\n", filename);
@@ -71,6 +70,7 @@ int main(int argc, char** argv)
     }
 
     data->setTrainTestSplitRatio(train_test_split_ratio);
+    std::cout << "Test/Train: " << data->getNTestSamples() << "/" << data->getNTrainSamples() << std::endl; // end the line so the banner printed next starts on its own line
 
     printf("======DTREE=====\n");
     Ptr<DTrees> dtree = DTrees::create();
@@ -106,10 +106,19 @@ int main(int argc, char** argv)
     rtrees->setUseSurrogates(false);
     rtrees->setMaxCategories(16);
     rtrees->setPriors(Mat());
-    rtrees->setCalculateVarImportance(false);
+    rtrees->setCalculateVarImportance(true);
     rtrees->setActiveVarCount(0);
     rtrees->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, 100, 0));
     train_and_print_errs(rtrees, data);
+    cv::Mat ref_labels = data->getClassLabels();
+    cv::Mat test_data = data->getTestSampleIdx();
+    cv::Mat predict_labels;
+    rtrees->predict(data->getSamples(), predict_labels);
 
+    cv::Mat variable_importance = rtrees->getVarImportance();
+    std::cout << "Estimated variable importance" << std::endl;
+    for (int i = 0; i < variable_importance.rows; i++) {
+        std::cout << "Variable " << i << ": " << variable_importance.at<float>(i, 0) << std::endl;
+    }
     return 0;
 }

From fda17273dec4f2a8768ae58a4fc507fe87d2ae87 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Thu, 10 Dec 2015 17:41:05 +0300
Subject: [PATCH 2/5] applying patch by rxtsolar:
 https://github.com/Itseez/opencv/pull/5422 for the master branch (even though
 it's actually not that important here)

---
 modules/ml/src/svm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 402de3f1d..0fd73a389 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1822,8 +1822,8 @@ public:
             }
         }
 
-        params = best_params;
         class_labels = class_labels0;
+        setParams(best_params);
         return do_train( samples, responses );
     }
 

From 544990e3771e1300a964fcb77a493c5d6a1d737b Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Thu, 10 Dec 2015 18:13:54 +0300
Subject: [PATCH 3/5] couple of small fixes in rtrees variable importance
 calculation

---
 modules/ml/src/rtrees.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index f1f122ebf..1c9120a6d 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -233,12 +233,13 @@ public:
                 oobError /= n_oob;
                 if( rparams.calcVarImportance && n_oob > 1 )
                 {
+                    Mat sample_clone;
                     oobperm.resize(n_oob);
                     for( i = 0; i < n_oob; i++ )
                         oobperm[i] = oobidx[i];
                     for (i = n_oob - 1; i > 0; --i)  //Randomly shuffle indices so we can permute features
                     {
-                        int r_i = rng.uniform(0, i + 1);
+                        int r_i = rng.uniform(0, n_oob);
                         std::swap(oobperm[i], oobperm[r_i]);
                     }
 
@@ -252,7 +253,7 @@ public:
                             j = oobidx[i];
                             int vj = oobperm[i];
                             sample0 = Mat( nallvars, 1, CV_32F, psamples + sstep0*w->sidx[j], sstep1*sizeof(psamples[0]) );
-                            Mat sample_clone = sample0.clone(); //create a copy so we don't mess up the original data
+                            sample0.copyTo(sample_clone); //create a copy so we don't mess up the original data
                             sample_clone.at<float>(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi];
 
                             double val = predictTrees(Range(treeidx, treeidx+1), sample_clone, predictFlags);

From 0d706f679647ee27c1f3e9b8ef60fb0d4c5204ba Mon Sep 17 00:00:00 2001
From: Deanna Hood <deanna.m.hood@gmail.com>
Date: Sat, 18 Apr 2015 21:32:29 -0400
Subject: [PATCH 4/5] Return uncompressed support vectors for getSupportVectors
 on linear SVM (Bug #4096)

---
 modules/ml/include/opencv2/ml.hpp             | 10 +++-
 modules/ml/src/svm.cpp                        | 54 +++++++++++++++++--
 modules/ml/test/test_svmtrainauto.cpp         | 48 +++++++++++++++++
 .../introduction_to_svm.cpp                   |  2 +-
 4 files changed, 109 insertions(+), 5 deletions(-)

diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp
index 715cbd998..862f3f950 100644
--- a/modules/ml/include/opencv2/ml.hpp
+++ b/modules/ml/include/opencv2/ml.hpp
@@ -675,11 +675,19 @@ public:
 
     /** @brief Retrieves all the support vectors
 
-    The method returns all the support vector as floating-point matrix, where support vectors are
+    The method returns all the support vectors as a floating-point matrix, where support vectors are
     stored as matrix rows.
      */
     CV_WRAP virtual Mat getSupportVectors() const = 0;
 
+    /** @brief Retrieves all the uncompressed support vectors of a linear %SVM
+
+    The method returns the original support vectors of a linear %SVM, i.e. the ones that the
+    compressed support vector used for prediction was derived from, as rows of a floating-point
+    matrix. For non-linear kernels, where no compression is done, the returned matrix is empty.
+     */
+    CV_WRAP Mat getUncompressedSupportVectors() const;
+
     /** @brief Retrieves the decision function
 
     @param i the index of the decision function. If the problem solved is regression, 1-class or
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 0fd73a389..757bb7a17 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1241,6 +1241,12 @@ public:
         df_alpha.clear();
         df_index.clear();
         sv.release();
+        uncompressed_sv.release();
+    }
+
+    Mat getUncompressedSupportVectors_() const
+    {
+        return uncompressed_sv;
     }
 
     Mat getSupportVectors() const
@@ -1538,6 +1544,7 @@ public:
         }
 
         optimize_linear_svm();
+
         return true;
     }
 
@@ -1588,6 +1595,7 @@ public:
 
         setRangeVector(df_index, df_count);
         df_alpha.assign(df_count, 1.);
+        sv.copyTo(uncompressed_sv);
         std::swap(sv, new_sv);
         std::swap(decision_func, new_df);
     }
@@ -2056,6 +2064,21 @@ public:
         }
         fs << "]";
 
+        if ( !uncompressed_sv.empty() )
+        {
+            // write the joint collection of uncompressed support vectors
+            int uncompressed_sv_total = uncompressed_sv.rows;
+            fs << "uncompressed_sv_total" << uncompressed_sv_total;
+            fs << "uncompressed_support_vectors" << "[";
+            for( i = 0; i < uncompressed_sv_total; i++ )
+            {
+                fs << "[:";
+                fs.writeRaw("f", uncompressed_sv.ptr(i), uncompressed_sv.cols*uncompressed_sv.elemSize());
+                fs << "]";
+            }
+            fs << "]";
+        }
+
         // write decision functions
         int df_count = (int)decision_func.size();
 
@@ -2096,7 +2119,7 @@ public:
             svm_type_str == "NU_SVR" ? NU_SVR : -1;
 
         if( svmType < 0 )
-            CV_Error( CV_StsParseError, "Missing of invalid SVM type" );
+            CV_Error( CV_StsParseError, "Missing or invalid SVM type" );
 
         FileNode kernel_node = fn["kernel"];
         if( kernel_node.empty() )
@@ -2168,14 +2191,31 @@ public:
         FileNode sv_node = fn["support_vectors"];
 
         CV_Assert((int)sv_node.size() == sv_total);
-        sv.create(sv_total, var_count, CV_32F);
 
+        sv.create(sv_total, var_count, CV_32F);
         FileNodeIterator sv_it = sv_node.begin();
         for( i = 0; i < sv_total; i++, ++sv_it )
         {
             (*sv_it).readRaw("f", sv.ptr(i), var_count*sv.elemSize());
         }
 
+        int uncompressed_sv_total = (int)fn["uncompressed_sv_total"];
+
+        if( uncompressed_sv_total > 0 )
+        {
+            // read uncompressed support vectors
+            FileNode uncompressed_sv_node = fn["uncompressed_support_vectors"];
+
+            CV_Assert((int)uncompressed_sv_node.size() == uncompressed_sv_total);
+            uncompressed_sv.create(uncompressed_sv_total, var_count, CV_32F);
+
+            FileNodeIterator uncompressed_sv_it = uncompressed_sv_node.begin();
+            for( i = 0; i < uncompressed_sv_total; i++, ++uncompressed_sv_it )
+            {
+                (*uncompressed_sv_it).readRaw("f", uncompressed_sv.ptr(i), var_count*uncompressed_sv.elemSize());
+            }
+        }
+
         // read decision functions
         int df_count = class_count > 1 ? class_count*(class_count-1)/2 : 1;
         FileNode df_node = fn["decision_functions"];
@@ -2207,7 +2247,7 @@ public:
     SvmParams params;
     Mat class_labels;
     int var_count;
-    Mat sv;
+    Mat sv, uncompressed_sv;
     vector<DecisionFunc> decision_func;
     vector<double> df_alpha;
     vector<int> df_index;
@@ -2221,6 +2261,14 @@ Ptr<SVM> SVM::create()
     return makePtr<SVMImpl>();
 }
 
+Mat SVM::getUncompressedSupportVectors() const
+{
+    const SVMImpl* this_ = dynamic_cast<const SVMImpl*>(this); // the real accessor is SVMImpl::getUncompressedSupportVectors_(), so downcast to reach it
+    if(!this_)
+        CV_Error(Error::StsNotImplemented, "the class is not SVMImpl"); // the object is some other SVM subclass
+    return this_->getUncompressedSupportVectors_();
+}
+
 }
 }
 
diff --git a/modules/ml/test/test_svmtrainauto.cpp b/modules/ml/test/test_svmtrainauto.cpp
index 3c4b72924..13cbe98f4 100644
--- a/modules/ml/test/test_svmtrainauto.cpp
+++ b/modules/ml/test/test_svmtrainauto.cpp
@@ -118,3 +118,51 @@ TEST(ML_SVM, trainAuto_regression_5369)
     EXPECT_EQ(0., result0);
     EXPECT_EQ(1., result1);
 }
+
+class CV_SVMGetSupportVectorsTest : public cvtest::BaseTest { // checks the row counts returned by both support-vector getters
+public:
+    CV_SVMGetSupportVectorsTest() {}
+protected:
+    virtual void run( int startFrom ); // trains a linear and a polynomial SVM and asserts on the returned SV matrices
+};
+void CV_SVMGetSupportVectorsTest::run(int /*startFrom*/ )
+{
+    int code = cvtest::TS::OK; // never changed below; the checks use CV_Assert rather than updating it
+
+    // Set up training data: four 2-D samples, one labelled +1 and three labelled -1
+    int labels[4] = {1, -1, -1, -1};
+    float trainingData[4][2] = { {501, 10}, {255, 10}, {501, 255}, {10, 501} };
+    Mat trainingDataMat(4, 2, CV_32FC1, trainingData);
+    Mat labelsMat(4, 1, CV_32SC1, labels);
+
+    Ptr<SVM> svm = SVM::create();
+    svm->setType(SVM::C_SVC);
+    svm->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, 100, 1e-6));
+
+    // Case 1: linear kernel. getSupportVectors() is expected to return the compressed form
+    // (one row here) and getUncompressedSupportVectors() is expected to return three rows.
+    svm->setKernel(SVM::LINEAR);
+    svm->train(trainingDataMat, cv::ml::ROW_SAMPLE, labelsMat);
+
+    Mat sv = svm->getSupportVectors();
+    CV_Assert(sv.rows == 1);    // by default compressed SV returned
+    sv = svm->getUncompressedSupportVectors();
+    CV_Assert(sv.rows == 3);
+
+    // Case 2: polynomial kernel of degree 2 on the same data. getSupportVectors() is expected
+    // to return three rows and getUncompressedSupportVectors() an empty matrix.
+    svm->setKernel(SVM::POLY);
+    svm->setDegree(2);
+    svm->train(trainingDataMat, cv::ml::ROW_SAMPLE, labelsMat);
+
+    sv = svm->getSupportVectors();
+    CV_Assert(sv.rows == 3);
+    sv = svm->getUncompressedSupportVectors();
+    CV_Assert(sv.rows == 0);    // inapplicable for non-linear SVMs
+
+    // code is still TS::OK if execution reaches this point
+    ts->set_failed_test_info(code);
+}
+
+// Test entry point: instantiates the check above and runs it through safe_run()
+TEST(ML_SVM, getSupportVectors) { CV_SVMGetSupportVectorsTest test; test.safe_run(); }
diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
index 0513e367d..9b0d569c6 100644
--- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@@ -65,7 +65,7 @@ int main(int, char**)
     //! [show_vectors]
     thickness = 2;
     lineType  = 8;
-    Mat sv = svm->getSupportVectors();
+    Mat sv = svm->getUncompressedSupportVectors();
 
     for (int i = 0; i < sv.rows; ++i)
     {

From 7a7b0bcfcbe2847708086637506acba8f3ecd832 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Thu, 10 Dec 2015 20:17:17 +0300
Subject: [PATCH 5/5] fixed the upper boundary when calling checkRange (thanks
 to alalek)

---
 modules/ml/src/data.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp
index feb79d93a..a1608e398 100644
--- a/modules/ml/src/data.cpp
+++ b/modules/ml/src/data.cpp
@@ -253,7 +253,7 @@ public:
         if( !sampleIdx.empty() )
         {
             CV_Assert( (sampleIdx.checkVector(1, CV_32S, true) > 0 &&
-                       checkRange(sampleIdx, true, 0, 0, nsamples-1)) ||
+                       checkRange(sampleIdx, true, 0, 0, nsamples)) ||
                        sampleIdx.checkVector(1, CV_8U, true) == nsamples );
             if( sampleIdx.type() == CV_8U )
                 sampleIdx = convertMaskToIdx(sampleIdx);