normal bayes classifier has been parallelized using TBB; letter_recog sample updated to demonstrate knearest & bayes classifiers (thanks to Konstantin Krivakin for the patches)

2011-04-14 17:04:39 +00:00 · 2011-04-14 17:04:39 +00:00 · b9fa21d011
commit b9fa21d011
parent 56b206dc7b
2 changed files with 233 additions and 47 deletions
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@ -277,63 +277,74 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
    return result;
 }

+struct predict_body {
+  predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
+     const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
+     CvMat* _results, float* _value, int _var_count1
+  )
+  {
+    c = _c;
+    cov_rotate_mats = _cov_rotate_mats;
+    inv_eigen_values = _inv_eigen_values;
+    avg = _avg;
+    samples = _samples;
+    vidx = _vidx;
+    cls_labels = _cls_labels;
+    results = _results;
+    value = _value;
+    var_count1 = _var_count1;
+  }
+  
+  CvMat* c;
+  CvMat** cov_rotate_mats;
+  CvMat** inv_eigen_values;
+  CvMat** avg;
+  const CvMat* samples;
+  const int* vidx;
+  CvMat* cls_labels;

-float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) const
-{
-    float value = 0;
+  CvMat* results;
+  float* value;
+  int var_count1;
+  
+  void operator()( const cv::BlockedRange& range ) const
+  {

-    int i, j, cls = -1;
-    double opt = FLT_MAX;
-    int rtype = 0, rstep = 0;
-    
+    int cls = -1;
+    int rtype = 0, rstep = 0; 
    int nclasses = cls_labels->cols;
    int _var_count = avg[0]->cols;
-
-    if( !CV_IS_MAT(samples) || CV_MAT_TYPE(samples->type) != CV_32FC1 || samples->cols != var_all )
-        CV_Error( CV_StsBadArg,
-        "The input samples must be 32f matrix with the number of columns = var_all" );
-
-    if( samples->rows > 1 && !results )
-        CV_Error( CV_StsNullPtr,
-        "When the number of input samples is >1, the output vector of results must be passed" );
-
-    if( results )
+    
+    if (results)
    {
-        if( !CV_IS_MAT(results) || (CV_MAT_TYPE(results->type) != CV_32FC1 &&
-        CV_MAT_TYPE(results->type) != CV_32SC1) ||
-        (results->cols != 1 && results->rows != 1) ||
-        results->cols + results->rows - 1 != samples->rows )
-        CV_Error( CV_StsBadArg, "The output array must be integer or floating-point vector "
-        "with the number of elements = number of rows in the input matrix" );
-
        rtype = CV_MAT_TYPE(results->type);
        rstep = CV_IS_MAT_CONT(results->type) ? 1 : results->step/CV_ELEM_SIZE(rtype);
    }
-
-    const int* vidx = var_idx ? var_idx->data.i : 0;
-
-// allocate memory and initializing headers for calculating
-    cv::AutoBuffer<double> buffer(nclasses + var_count);
-    CvMat diff = cvMat( 1, var_count, CV_64FC1, &buffer[0] );
-
-    for( int k = 0; k < samples->rows; k++ )
+    // allocate memory and initializing headers for calculating
+    cv::AutoBuffer<double> buffer(nclasses + var_count1);
+    CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
+    
+    for(int k = range.begin(); k < range.end(); k += 1 )
    {
        int ival;
+        double opt = FLT_MAX;

-        for( i = 0; i < nclasses; i++ )
+        for(int i = 0; i < nclasses; i++ )
        {
+
            double cur = c->data.db[i];
            CvMat* u = cov_rotate_mats[i];
            CvMat* w = inv_eigen_values[i];
+
            const double* avg_data = avg[i]->data.db;
            const float* x = (const float*)(samples->data.ptr + samples->step*k);

            // cov = u w u'  -->  cov^(-1) = u w^(-1) u'
-            for( j = 0; j < _var_count; j++ )
+            for(int j = 0; j < _var_count; j++ )
                diff.data.db[j] = avg_data[j] - x[vidx ? vidx[j] : j];

            cvGEMM( &diff, u, 1, 0, 0, &diff, CV_GEMM_B_T );
-            for( j = 0; j < _var_count; j++ )
+            for(int j = 0; j < _var_count; j++ )
            {
                double d = diff.data.db[j];
                cur += d*d*w->data.db[j];
@ -356,17 +367,39 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c
                results->data.fl[k*rstep] = (float)ival;
        }
        if( k == 0 )
-            value = (float)ival;
-
-        /*if( _probs )
-        {
-            CV_CALL( cvConvertScale( &expo, &expo, -0.5 ));
-            CV_CALL( cvExp( &expo, &expo ));
-            if( _probs->cols == 1 )
-                CV_CALL( cvReshape( &expo, &expo, 1, nclasses ));
-            CV_CALL( cvConvertScale( &expo, _probs, 1./cvSum( &expo ).val[0] ));
-        }*/
+            *value = (float)ival;
    }
+  }
+};
+
+
+float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) const
+{
+    float value = 0;
+
+    if( !CV_IS_MAT(samples) || CV_MAT_TYPE(samples->type) != CV_32FC1 || samples->cols != var_all )
+        CV_Error( CV_StsBadArg,
+        "The input samples must be 32f matrix with the number of columns = var_all" );
+
+    if( samples->rows > 1 && !results )
+        CV_Error( CV_StsNullPtr,
+        "When the number of input samples is >1, the output vector of results must be passed" );
+
+    if( results )
+    {
+        if( !CV_IS_MAT(results) || (CV_MAT_TYPE(results->type) != CV_32FC1 &&
+        CV_MAT_TYPE(results->type) != CV_32SC1) ||
+        (results->cols != 1 && results->rows != 1) ||
+        results->cols + results->rows - 1 != samples->rows )
+        CV_Error( CV_StsBadArg, "The output array must be integer or floating-point vector "
+        "with the number of elements = number of rows in the input matrix" );
+    }
+
+    const int* vidx = var_idx ? var_idx->data.i : 0;
+
+    cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
+                                                                      vidx, cls_labels, results, &value, var_count
+    ));

    return value;
 }
--- a/samples/cpp/letter_recog.cpp
+++ b/samples/cpp/letter_recog.cpp
@ -9,7 +9,7 @@
 void help()
 {
 	printf("\nThe sample demonstrates how to train Random Trees classifier\n"
-	"(or Boosting classifier, or MLP - see main()) using the provided dataset.\n"
+	"(or Boosting classifier, or MLP, or Knearest, or Nbayes - see main()) using the provided dataset.\n"
 	"\n"
 	"We use the sample database letter-recognition.data\n"
 	"from UCI Repository, here is the link:\n"
@ -28,7 +28,7 @@ void help()
            "The usage: letter_recog [-data <path to letter-recognition.data>] \\\n"
            "  [-save <output XML file for the classifier>] \\\n"
            "  [-load <XML file with the pre-trained classifier>] \\\n"
-            "  [-boost|-mlp] # to use boost/mlp classifier instead of default Random Trees\n" );
+            "  [-boost|-mlp|-knearest|-nbayes] # to use boost/mlp/knearest classifier instead of default Random Trees\n" );
 }

 // This function reads data and responses from the file <filename>
@ -484,6 +484,147 @@ int build_mlp_classifier( char* data_filename,
    return 0;
 }

+static
+int build_knearest_classifier( char* data_filename, int K )
+{
+    const int var_count = 16;
+    CvMat* data = 0;
+    CvMat train_data;
+    CvMat* responses;
+
+    int ok = read_num_class_data( data_filename, 16, &data, &responses );
+    int nsamples_all = 0, ntrain_samples = 0;
+    int i, j;
+    double train_hr = 0, test_hr = 0;
+    CvANN_MLP mlp;
+
+    if( !ok )
+    {
+        printf( "Could not read the database %s\n", data_filename );
+        return -1;
+    }
+
+    printf( "The database %s is loaded.\n", data_filename );
+    nsamples_all = data->rows;
+    ntrain_samples = (int)(nsamples_all*0.8);
+
+    // 1. unroll the responses
+    printf( "Unrolling the responses...\n");
+    cvGetRows( data, &train_data, 0, ntrain_samples );
+
+    // 2. train classifier
+    CvMat* train_resp = cvCreateMat( ntrain_samples, 1, CV_32FC1);
+    for (int i = 0; i < ntrain_samples; i++)
+        train_resp->data.fl[i] = responses->data.fl[i];
+    CvKNearest knearest(&train_data, train_resp);
+
+    CvMat* nearests = cvCreateMat( (nsamples_all - ntrain_samples), K, CV_32FC1);
+    float _sample[var_count * (nsamples_all - ntrain_samples)];
+    CvMat sample = cvMat( nsamples_all - ntrain_samples, 16, CV_32FC1, _sample );
+    float true_results[nsamples_all - ntrain_samples];
+    for (int j = ntrain_samples; j < nsamples_all; j++)
+    {
+        float *s = data->data.fl + j * var_count;
+        
+        for (int i = 0; i < var_count; i++)
+        {   
+            sample.data.fl[(j - ntrain_samples) * var_count + i] = s[i];
+        }
+        true_results[j - ntrain_samples] = responses->data.fl[j];
+    }
+    CvMat *result = cvCreateMat(1, nsamples_all - ntrain_samples, CV_32FC1);
+	knearest.find_nearest(&sample, K, result, 0, nearests, 0);
+    int true_resp = 0;
+    int accuracy = 0;
+    for (int i = 0; i < nsamples_all - ntrain_samples; i++)
+    {
+        if (result->data.fl[i] == true_results[i])
+            true_resp++;
+        for(int k = 0; k < K; k++ )
+        {
+            if( nearests->data.fl[i * K + k] == true_results[i])
+            accuracy++;
+        }
+    }
+    
+    printf("true_resp = %f%%\tavg accuracy = %f%%\n", (float)true_resp / (nsamples_all - ntrain_samples) * 100, 
+                                                      (float)accuracy / (nsamples_all - ntrain_samples) / K * 100);
+    
+    cvReleaseMat( &train_resp );
+    cvReleaseMat( &nearests );
+    cvReleaseMat( &result );
+    cvReleaseMat( &data );
+    cvReleaseMat( &responses );
+
+    return 0;
+}
+
+static
+int build_nbayes_classifier( char* data_filename )
+{
+    const int var_count = 16;
+    CvMat* data = 0;
+    CvMat train_data;
+    CvMat* responses;
+
+    int ok = read_num_class_data( data_filename, 16, &data, &responses );
+    int nsamples_all = 0, ntrain_samples = 0;
+    int i, j;
+    double train_hr = 0, test_hr = 0;
+    CvANN_MLP mlp;
+
+    if( !ok )
+    {
+        printf( "Could not read the database %s\n", data_filename );
+        return -1;
+    }
+
+    printf( "The database %s is loaded.\n", data_filename );
+    nsamples_all = data->rows;
+    ntrain_samples = (int)(nsamples_all*0.5);
+
+    // 1. unroll the responses
+    printf( "Unrolling the responses...\n");
+    cvGetRows( data, &train_data, 0, ntrain_samples );
+
+    // 2. train classifier
+    CvMat* train_resp = cvCreateMat( ntrain_samples, 1, CV_32FC1);
+    for (int i = 0; i < ntrain_samples; i++)
+        train_resp->data.fl[i] = responses->data.fl[i];
+    CvNormalBayesClassifier nbayes(&train_data, train_resp);
+
+    float _sample[var_count * (nsamples_all - ntrain_samples)];
+    CvMat sample = cvMat( nsamples_all - ntrain_samples, 16, CV_32FC1, _sample );
+    float true_results[nsamples_all - ntrain_samples];
+    for (int j = ntrain_samples; j < nsamples_all; j++)
+    {
+        float *s = data->data.fl + j * var_count;
+        
+        for (int i = 0; i < var_count; i++)
+        {   
+            sample.data.fl[(j - ntrain_samples) * var_count + i] = s[i];
+        }
+        true_results[j - ntrain_samples] = responses->data.fl[j];
+    }
+    CvMat *result = cvCreateMat(1, nsamples_all - ntrain_samples, CV_32FC1);
+    (int)nbayes.predict(&sample, result);
+    int true_resp = 0;
+    int accuracy = 0;
+    for (int i = 0; i < nsamples_all - ntrain_samples; i++)
+    {
+        if (result->data.fl[i] == true_results[i])
+            true_resp++;
+    }
+    
+    printf("true_resp = %f%%\n", (float)true_resp / (nsamples_all - ntrain_samples) * 100);
+    
+    cvReleaseMat( &train_resp );
+    cvReleaseMat( &result );
+    cvReleaseMat( &data );
+    cvReleaseMat( &responses );
+
+    return 0;
+}

 int main( int argc, char *argv[] )
 {
@ -519,6 +660,14 @@ int main( int argc, char *argv[] )
        {
            method = 2;
        }
+        else if ( strcmp(argv[i], "-knearest") == 0)
+	{
+	    method = 3;
+	}
+	else if ( strcmp(argv[i], "-nbayes") == 0)
+	{
+	    method = 4;
+	}
        else
            break;
    }
@ -530,6 +679,10 @@ int main( int argc, char *argv[] )
        build_boost_classifier( data_filename, filename_to_save, filename_to_load ) :
        method == 2 ?
        build_mlp_classifier( data_filename, filename_to_save, filename_to_load ) :
+        method == 3 ?
+        build_knearest_classifier( data_filename, 10 ) :
+        method == 4 ?
+        build_nbayes_classifier( data_filename) :
        -1) < 0)
    {
    	help();