From d8e3971e7f3608b91d4f5e6467a8391b6a12edf6 Mon Sep 17 00:00:00 2001 From: niederb Date: Sun, 29 Nov 2015 23:25:46 +0100 Subject: [PATCH 1/5] Fixed variable importance in rtrees --- modules/ml/src/rtrees.cpp | 25 +++++++++++++------------ samples/cpp/tree_engine.cpp | 13 +++++++++++-- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp index 4da34992d..f1f122ebf 100644 --- a/modules/ml/src/rtrees.cpp +++ b/modules/ml/src/rtrees.cpp @@ -187,7 +187,7 @@ public: oobidx.clear(); for( i = 0; i < n; i++ ) { - if( !oobmask[i] ) + if( oobmask[i] ) oobidx.push_back(i); } int n_oob = (int)oobidx.size(); @@ -217,6 +217,7 @@ public: else { int ival = cvRound(val); + //Voting scheme to combine OOB errors of each tree int* votes = &oobvotes[j*nclasses]; votes[ival]++; int best_class = 0; @@ -235,35 +236,35 @@ public: oobperm.resize(n_oob); for( i = 0; i < n_oob; i++ ) oobperm[i] = oobidx[i]; + for (i = n_oob - 1; i > 0; --i) //Randomly shuffle indices so we can permute features + { + int r_i = rng.uniform(0, i + 1); + std::swap(oobperm[i], oobperm[r_i]); + } for( vi_ = 0; vi_ < nvars; vi_++ ) { - vi = vidx ? vidx[vi_] : vi_; + vi = vidx ? vidx[vi_] : vi_; //Ensure that only the user specified predictors are used for training double ncorrect_responses_permuted = 0; - for( i = 0; i < n_oob; i++ ) - { - int i1 = rng.uniform(0, n_oob); - int i2 = rng.uniform(0, n_oob); - std::swap(i1, i2); - } for( i = 0; i < n_oob; i++ ) { j = oobidx[i]; int vj = oobperm[i]; sample0 = Mat( nallvars, 1, CV_32F, psamples + sstep0*w->sidx[j], sstep1*sizeof(psamples[0]) ); - for( k = 0; k < nallvars; k++ ) - sample.at(k) = sample0.at(k); - sample.at(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi]; + Mat sample_clone = sample0.clone(); //create a copy so we don't mess up the original data + sample_clone.at(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi]; - double val = predictTrees(Range(treeidx, treeidx+1), sample, predictFlags); + double val = predictTrees(Range(treeidx, treeidx+1), sample_clone, predictFlags); if( !_isClassifier ) { val = (val - w->ord_responses[w->sidx[j]])/max_response; ncorrect_responses_permuted += exp( -val*val ); } else + { ncorrect_responses_permuted += cvRound(val) == w->cat_responses[w->sidx[j]]; + } } varImportance[vi] += (float)(ncorrect_responses - ncorrect_responses_permuted); } diff --git a/samples/cpp/tree_engine.cpp b/samples/cpp/tree_engine.cpp index 2d6824d24..d9fbb9678 100644 --- a/samples/cpp/tree_engine.cpp +++ b/samples/cpp/tree_engine.cpp @@ -63,7 +63,6 @@ int main(int argc, char** argv) const double train_test_split_ratio = 0.5; Ptr data = TrainData::loadFromCSV(filename, 0, response_idx, response_idx+1, typespec); - if( data.empty() ) { printf("ERROR: File %s can not be read\n", filename); @@ -71,6 +70,7 @@ int main(int argc, char** argv) } data->setTrainTestSplitRatio(train_test_split_ratio); + std::cout << "Test/Train: " << data->getNTestSamples() << "/" << data->getNTrainSamples(); printf("======DTREE=====\n"); Ptr dtree = DTrees::create(); @@ -106,10 +106,19 @@ int main(int argc, char** argv) rtrees->setUseSurrogates(false); rtrees->setMaxCategories(16); rtrees->setPriors(Mat()); - rtrees->setCalculateVarImportance(false); + rtrees->setCalculateVarImportance(true); rtrees->setActiveVarCount(0); rtrees->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, 100, 0)); train_and_print_errs(rtrees, data); + cv::Mat ref_labels = data->getClassLabels(); + cv::Mat test_data = data->getTestSampleIdx(); + cv::Mat predict_labels; + rtrees->predict(data->getSamples(), predict_labels); + cv::Mat variable_importance = rtrees->getVarImportance(); + std::cout << "Estimated variable importance" << std::endl; + for (int i = 0; i < variable_importance.rows; i++) { + std::cout << "Variable " << i << ": " << variable_importance.at(i, 0) << std::endl; + } return 0; } From fda17273dec4f2a8768ae58a4fc507fe87d2ae87 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 10 Dec 2015 17:41:05 +0300 Subject: [PATCH 2/5] applying patch by rxtsolar: https://github.com/Itseez/opencv/pull/5422 for the master branch (even though it's actually not that important here) --- modules/ml/src/svm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index 402de3f1d..0fd73a389 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -1822,8 +1822,8 @@ public: } } - params = best_params; class_labels = class_labels0; + setParams(best_params); return do_train( samples, responses ); } From 544990e3771e1300a964fcb77a493c5d6a1d737b Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 10 Dec 2015 18:13:54 +0300 Subject: [PATCH 3/5] couple of small fixes in rtrees variable importance calculation --- modules/ml/src/rtrees.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp index f1f122ebf..1c9120a6d 100644 --- a/modules/ml/src/rtrees.cpp +++ b/modules/ml/src/rtrees.cpp @@ -233,12 +233,13 @@ public: oobError /= n_oob; if( rparams.calcVarImportance && n_oob > 1 ) { + Mat sample_clone; oobperm.resize(n_oob); for( i = 0; i < n_oob; i++ ) oobperm[i] = oobidx[i]; for (i = n_oob - 1; i > 0; --i) //Randomly shuffle indices so we can permute features { - int r_i = rng.uniform(0, i + 1); + int r_i = rng.uniform(0, n_oob); std::swap(oobperm[i], oobperm[r_i]); } @@ -252,7 +253,7 @@ public: j = oobidx[i]; int vj = oobperm[i]; sample0 = Mat( nallvars, 1, CV_32F, psamples + sstep0*w->sidx[j], sstep1*sizeof(psamples[0]) ); - Mat sample_clone = sample0.clone(); //create a copy so we don't mess up the original data + sample0.copyTo(sample_clone); //create a copy so we don't mess up the original data sample_clone.at(vi) = psamples[sstep0*w->sidx[vj] + sstep1*vi]; double val = predictTrees(Range(treeidx, treeidx+1), sample_clone, predictFlags); From 0d706f679647ee27c1f3e9b8ef60fb0d4c5204ba Mon Sep 17 00:00:00 2001 From: Deanna Hood Date: Sat, 18 Apr 2015 21:32:29 -0400 Subject: [PATCH 4/5] Return uncompressed support vectors for getSupportVectors on linear SVM (Bug #4096) --- modules/ml/include/opencv2/ml.hpp | 10 +++- modules/ml/src/svm.cpp | 54 +++++++++++++++++-- modules/ml/test/test_svmtrainauto.cpp | 48 +++++++++++++++++ .../introduction_to_svm.cpp | 2 +- 4 files changed, 109 insertions(+), 5 deletions(-) diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp index 715cbd998..862f3f950 100644 --- a/modules/ml/include/opencv2/ml.hpp +++ b/modules/ml/include/opencv2/ml.hpp @@ -675,11 +675,19 @@ public: /** @brief Retrieves all the support vectors - The method returns all the support vector as floating-point matrix, where support vectors are + The method returns all the support vectors as a floating-point matrix, where support vectors are stored as matrix rows. */ CV_WRAP virtual Mat getSupportVectors() const = 0; + /** @brief Retrieves all the uncompressed support vectors of a linear %SVM + + The method returns all the uncompressed support vectors of a linear %SVM that the compressed + support vector, used for prediction, was derived from. They are returned in a floating-point + matrix, where the support vectors are stored as matrix rows. + */ + CV_WRAP Mat getUncompressedSupportVectors() const; + /** @brief Retrieves the decision function @param i the index of the decision function. If the problem solved is regression, 1-class or diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index 0fd73a389..757bb7a17 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -1241,6 +1241,12 @@ public: df_alpha.clear(); df_index.clear(); sv.release(); + uncompressed_sv.release(); + } + + Mat getUncompressedSupportVectors_() const + { + return uncompressed_sv; } Mat getSupportVectors() const @@ -1538,6 +1544,7 @@ public: } optimize_linear_svm(); + return true; } @@ -1588,6 +1595,7 @@ public: setRangeVector(df_index, df_count); df_alpha.assign(df_count, 1.); + sv.copyTo(uncompressed_sv); std::swap(sv, new_sv); std::swap(decision_func, new_df); } @@ -2056,6 +2064,21 @@ public: } fs << "]"; + if ( !uncompressed_sv.empty() ) + { + // write the joint collection of uncompressed support vectors + int uncompressed_sv_total = uncompressed_sv.rows; + fs << "uncompressed_sv_total" << uncompressed_sv_total; + fs << "uncompressed_support_vectors" << "["; + for( i = 0; i < uncompressed_sv_total; i++ ) + { + fs << "[:"; + fs.writeRaw("f", uncompressed_sv.ptr(i), uncompressed_sv.cols*uncompressed_sv.elemSize()); + fs << "]"; + } + fs << "]"; + } + // write decision functions int df_count = (int)decision_func.size(); @@ -2096,7 +2119,7 @@ public: svm_type_str == "NU_SVR" ? NU_SVR : -1; if( svmType < 0 ) - CV_Error( CV_StsParseError, "Missing of invalid SVM type" ); + CV_Error( CV_StsParseError, "Missing or invalid SVM type" ); FileNode kernel_node = fn["kernel"]; if( kernel_node.empty() ) @@ -2168,14 +2191,31 @@ public: FileNode sv_node = fn["support_vectors"]; CV_Assert((int)sv_node.size() == sv_total); - sv.create(sv_total, var_count, CV_32F); + sv.create(sv_total, var_count, CV_32F); FileNodeIterator sv_it = sv_node.begin(); for( i = 0; i < sv_total; i++, ++sv_it ) { (*sv_it).readRaw("f", sv.ptr(i), var_count*sv.elemSize()); } + int uncompressed_sv_total = (int)fn["uncompressed_sv_total"]; + + if( uncompressed_sv_total > 0 ) + { + // read uncompressed support vectors + FileNode uncompressed_sv_node = fn["uncompressed_support_vectors"]; + + CV_Assert((int)uncompressed_sv_node.size() == uncompressed_sv_total); + uncompressed_sv.create(uncompressed_sv_total, var_count, CV_32F); + + FileNodeIterator uncompressed_sv_it = uncompressed_sv_node.begin(); + for( i = 0; i < uncompressed_sv_total; i++, ++uncompressed_sv_it ) + { + (*uncompressed_sv_it).readRaw("f", uncompressed_sv.ptr(i), var_count*uncompressed_sv.elemSize()); + } + } + // read decision functions int df_count = class_count > 1 ? class_count*(class_count-1)/2 : 1; FileNode df_node = fn["decision_functions"]; @@ -2207,7 +2247,7 @@ public: SvmParams params; Mat class_labels; int var_count; - Mat sv; + Mat sv, uncompressed_sv; vector decision_func; vector df_alpha; vector df_index; @@ -2221,6 +2261,14 @@ Ptr SVM::create() return makePtr(); } +Mat SVM::getUncompressedSupportVectors() const +{ + const SVMImpl* this_ = dynamic_cast(this); + if(!this_) + CV_Error(Error::StsNotImplemented, "the class is not SVMImpl"); + return this_->getUncompressedSupportVectors_(); +} + } } diff --git a/modules/ml/test/test_svmtrainauto.cpp b/modules/ml/test/test_svmtrainauto.cpp index 3c4b72924..13cbe98f4 100644 --- a/modules/ml/test/test_svmtrainauto.cpp +++ b/modules/ml/test/test_svmtrainauto.cpp @@ -118,3 +118,51 @@ TEST(ML_SVM, trainAuto_regression_5369) EXPECT_EQ(0., result0); EXPECT_EQ(1., result1); } + +class CV_SVMGetSupportVectorsTest : public cvtest::BaseTest { +public: + CV_SVMGetSupportVectorsTest() {} +protected: + virtual void run( int startFrom ); +}; +void CV_SVMGetSupportVectorsTest::run(int /*startFrom*/ ) +{ + int code = cvtest::TS::OK; + + // Set up training data + int labels[4] = {1, -1, -1, -1}; + float trainingData[4][2] = { {501, 10}, {255, 10}, {501, 255}, {10, 501} }; + Mat trainingDataMat(4, 2, CV_32FC1, trainingData); + Mat labelsMat(4, 1, CV_32SC1, labels); + + Ptr svm = SVM::create(); + svm->setType(SVM::C_SVC); + svm->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, 100, 1e-6)); + + + // Test retrieval of SVs and compressed SVs on linear SVM + svm->setKernel(SVM::LINEAR); + svm->train(trainingDataMat, cv::ml::ROW_SAMPLE, labelsMat); + + Mat sv = svm->getSupportVectors(); + CV_Assert(sv.rows == 1); // by default compressed SV returned + sv = svm->getUncompressedSupportVectors(); + CV_Assert(sv.rows == 3); + + + // Test retrieval of SVs and compressed SVs on non-linear SVM + svm->setKernel(SVM::POLY); + svm->setDegree(2); + svm->train(trainingDataMat, cv::ml::ROW_SAMPLE, labelsMat); + + sv = svm->getSupportVectors(); + CV_Assert(sv.rows == 3); + sv = svm->getUncompressedSupportVectors(); + CV_Assert(sv.rows == 0); // inapplicable for non-linear SVMs + + + ts->set_failed_test_info(code); +} + + +TEST(ML_SVM, getSupportVectors) { CV_SVMGetSupportVectorsTest test; test.safe_run(); } diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp index 0513e367d..9b0d569c6 100644 --- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp +++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp @@ -65,7 +65,7 @@ int main(int, char**) //! [show_vectors] thickness = 2; lineType = 8; - Mat sv = svm->getSupportVectors(); + Mat sv = svm->getUncompressedSupportVectors(); for (int i = 0; i < sv.rows; ++i) { From 7a7b0bcfcbe2847708086637506acba8f3ecd832 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 10 Dec 2015 20:17:17 +0300 Subject: [PATCH 5/5] fixed the upper boundary when calling checkRange (thanks to alalek) --- modules/ml/src/data.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp index feb79d93a..a1608e398 100644 --- a/modules/ml/src/data.cpp +++ b/modules/ml/src/data.cpp @@ -253,7 +253,7 @@ public: if( !sampleIdx.empty() ) { CV_Assert( (sampleIdx.checkVector(1, CV_32S, true) > 0 && - checkRange(sampleIdx, true, 0, 0, nsamples-1)) || + checkRange(sampleIdx, true, 0, 0, nsamples)) || sampleIdx.checkVector(1, CV_8U, true) == nsamples ); if( sampleIdx.type() == CV_8U ) sampleIdx = convertMaskToIdx(sampleIdx);