initial commit; ml has been refactored; it compiles and the tests run well; some other modules, apps and samples do not compile; to be fixed

2014-07-29 23:54:23 +04:00
parent dce1824a91
commit ba3783d205
25 changed files with 8320 additions and 21792 deletions
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -38,8 +38,8 @@
 //
 //M*/

-#ifndef __OPENCV_PRECOMP_H__
-#define __OPENCV_PRECOMP_H__
+#ifndef __OPENCV_ML_PRECOMP_HPP__
+#define __OPENCV_ML_PRECOMP_HPP__

 #include "opencv2/core.hpp"
 #include "opencv2/ml.hpp"
@@ -56,321 +56,217 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
-
-#define ML_IMPL CV_IMPL
-#define __BEGIN__ __CV_BEGIN__
-#define __END__ __CV_END__
-#define EXIT __CV_EXIT__
-
-#define CV_MAT_ELEM_FLAG( mat, type, comp, vect, tflag )    \
-    (( tflag == CV_ROW_SAMPLE )                             \
-    ? (CV_MAT_ELEM( mat, type, comp, vect ))                \
-    : (CV_MAT_ELEM( mat, type, vect, comp )))
-
-/* Convert matrix to vector */
-#define ICV_MAT2VEC( mat, vdata, vstep, num )      \
-    if( MIN( (mat).rows, (mat).cols ) != 1 )       \
-        CV_ERROR( CV_StsBadArg, "" );              \
-    (vdata) = ((mat).data.ptr);                    \
-    if( (mat).rows == 1 )                          \
-    {                                              \
-        (vstep) = CV_ELEM_SIZE( (mat).type );      \
-        (num) = (mat).cols;                        \
-    }                                              \
-    else                                           \
-    {                                              \
-        (vstep) = (mat).step;                      \
-        (num) = (mat).rows;                        \
-    }
-
-/* get raw data */
-#define ICV_RAWDATA( mat, flags, rdata, sstep, cstep, m, n )         \
-    (rdata) = (mat).data.ptr;                                        \
-    if( CV_IS_ROW_SAMPLE( flags ) )                                  \
-    {                                                                \
-        (sstep) = (mat).step;                                        \
-        (cstep) = CV_ELEM_SIZE( (mat).type );                        \
-        (m) = (mat).rows;                                            \
-        (n) = (mat).cols;                                            \
-    }                                                                \
-    else                                                             \
-    {                                                                \
-        (cstep) = (mat).step;                                        \
-        (sstep) = CV_ELEM_SIZE( (mat).type );                        \
-        (n) = (mat).rows;                                            \
-        (m) = (mat).cols;                                            \
-    }
-
-#define ICV_IS_MAT_OF_TYPE( mat, mat_type) \
-    (CV_IS_MAT( mat ) && CV_MAT_TYPE( mat->type ) == (mat_type) &&   \
-    (mat)->cols > 0 && (mat)->rows > 0)
-
-/*
-    uchar* data; int sstep, cstep;      - trainData->data
-    uchar* classes; int clstep; int ncl;- trainClasses
-    uchar* tmask; int tmstep; int ntm;  - typeMask
-    uchar* missed;int msstep, mcstep;   -missedMeasurements...
-    int mm, mn;                         == m,n == size,dim
-    uchar* sidx;int sistep;             - sampleIdx
-    uchar* cidx;int cistep;             - compIdx
-    int k, l;                           == n,m == dim,size (length of cidx, sidx)
-    int m, n;                           == size,dim
-*/
-#define ICV_DECLARE_TRAIN_ARGS()                                                    \
-    uchar* data;                                                                    \
-    int sstep, cstep;                                                               \
-    uchar* classes;                                                                 \
-    int clstep;                                                                     \
-    int ncl;                                                                        \
-    uchar* tmask;                                                                   \
-    int tmstep;                                                                     \
-    int ntm;                                                                        \
-    uchar* missed;                                                                  \
-    int msstep, mcstep;                                                             \
-    int mm, mn;                                                                     \
-    uchar* sidx;                                                                    \
-    int sistep;                                                                     \
-    uchar* cidx;                                                                    \
-    int cistep;                                                                     \
-    int k, l;                                                                       \
-    int m, n;                                                                       \
-                                                                                    \
-    data = classes = tmask = missed = sidx = cidx = NULL;                           \
-    sstep = cstep = clstep = ncl = tmstep = ntm = msstep = mcstep = mm = mn = 0;    \
-    sistep = cistep = k = l = m = n = 0;
-
-#define ICV_TRAIN_DATA_REQUIRED( param, flags )                                     \
-    if( !ICV_IS_MAT_OF_TYPE( (param), CV_32FC1 ) )                                  \
-    {                                                                               \
-        CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );                   \
-    }                                                                               \
-    else                                                                            \
-    {                                                                               \
-        ICV_RAWDATA( *(param), (flags), data, sstep, cstep, m, n );                 \
-        k = n;                                                                      \
-        l = m;                                                                      \
-    }
-
-#define ICV_TRAIN_CLASSES_REQUIRED( param )                                         \
-    if( !ICV_IS_MAT_OF_TYPE( (param), CV_32FC1 ) )                                  \
-    {                                                                               \
-        CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );                   \
-    }                                                                               \
-    else                                                                            \
-    {                                                                               \
-        ICV_MAT2VEC( *(param), classes, clstep, ncl );                              \
-        if( m != ncl )                                                              \
-        {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Unmatched sizes" );                            \
-        }                                                                           \
-    }
-
-#define ICV_ARG_NULL( param )                                                       \
-    if( (param) != NULL )                                                           \
-    {                                                                               \
-        CV_ERROR( CV_StsBadArg, #param " parameter must be NULL" );                 \
-    }
-
-#define ICV_MISSED_MEASUREMENTS_OPTIONAL( param, flags )                            \
-    if( param )                                                                     \
-    {                                                                               \
-        if( !ICV_IS_MAT_OF_TYPE( param, CV_8UC1 ) )                                 \
-        {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
-        }                                                                           \
-        else                                                                        \
-        {                                                                           \
-            ICV_RAWDATA( *(param), (flags), missed, msstep, mcstep, mm, mn );       \
-            if( mm != m || mn != n )                                                \
-            {                                                                       \
-                CV_ERROR( CV_StsBadArg, "Unmatched sizes" );                        \
-            }                                                                       \
-        }                                                                           \
-    }
-
-#define ICV_COMP_IDX_OPTIONAL( param )                                              \
-    if( param )                                                                     \
-    {                                                                               \
-        if( !ICV_IS_MAT_OF_TYPE( param, CV_32SC1 ) )                                \
-        {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
-        }                                                                           \
-        else                                                                        \
-        {                                                                           \
-            ICV_MAT2VEC( *(param), cidx, cistep, k );                               \
-            if( k > n )                                                             \
-                CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );           \
-        }                                                                           \
-    }
-
-#define ICV_SAMPLE_IDX_OPTIONAL( param )                                            \
-    if( param )                                                                     \
-    {                                                                               \
-        if( !ICV_IS_MAT_OF_TYPE( param, CV_32SC1 ) )                                \
-        {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
-        }                                                                           \
-        else                                                                        \
-        {                                                                           \
-            ICV_MAT2VEC( *sampleIdx, sidx, sistep, l );                             \
-            if( l > m )                                                             \
-                CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );           \
-        }                                                                           \
-    }
-
-/****************************************************************************************/
-#define ICV_CONVERT_FLOAT_ARRAY_TO_MATRICE( array, matrice )        \
-{                                                                   \
-    CvMat a, b;                                                     \
-    int dims = (matrice)->cols;                                     \
-    int nsamples = (matrice)->rows;                                 \
-    int type = CV_MAT_TYPE((matrice)->type);                        \
-    int i, offset = dims;                                           \
-                                                                    \
-    CV_ASSERT( type == CV_32FC1 || type == CV_64FC1 );              \
-    offset *= ((type == CV_32FC1) ? sizeof(float) : sizeof(double));\
-                                                                    \
-    b = cvMat( 1, dims, CV_32FC1 );                                 \
-    cvGetRow( matrice, &a, 0 );                                     \
-    for( i = 0; i < nsamples; i++, a.data.ptr += offset )           \
-    {                                                               \
-        b.data.fl = (float*)array[i];                               \
-        CV_CALL( cvConvert( &b, &a ) );                             \
-    }                                                               \
-}
+#include <vector>

 /****************************************************************************************\
-*                       Auxiliary functions declarations                                 *
-\****************************************************************************************/
+ *                               Main struct definitions                                  *
+ \****************************************************************************************/

-/* Generates a set of classes centers in quantity <num_of_clusters> that are generated as
-   uniform random vectors in parallelepiped, where <data> is concentrated. Vectors in
-   <data> should have horizontal orientation. If <centers> != NULL, the function doesn't
-   allocate any memory and stores generated centers in <centers>, returns <centers>.
-   If <centers> == NULL, the function allocates memory and creates the matrice. Centers
-   are supposed to be oriented horizontally. */
-CvMat* icvGenerateRandomClusterCenters( int seed,
-                                        const CvMat* data,
-                                        int num_of_clusters,
-                                        CvMat* centers CV_DEFAULT(0));
-
-/* Fills the <labels> using <probs> by choosing the maximal probability. Outliers are
-   fixed by <oulier_tresh> and have cluster label (-1). Function also controls that there
-   weren't "empty" clusters by filling empty clusters with the maximal probability vector.
-   If probs_sums != NULL, filles it with the sums of probabilities for each sample (it is
-   useful for normalizing probabilities' matrice of FCM) */
-void icvFindClusterLabels( const CvMat* probs, float outlier_thresh, float r,
-                           const CvMat* labels );
-
-typedef struct CvSparseVecElem32f
-{
-    int idx;
-    float val;
-}
-CvSparseVecElem32f;
-
-/* Prepare training data and related parameters */
-#define CV_TRAIN_STATMODEL_DEFRAGMENT_TRAIN_DATA    1
-#define CV_TRAIN_STATMODEL_SAMPLES_AS_ROWS          2
-#define CV_TRAIN_STATMODEL_SAMPLES_AS_COLUMNS       4
-#define CV_TRAIN_STATMODEL_CATEGORICAL_RESPONSE     8
-#define CV_TRAIN_STATMODEL_ORDERED_RESPONSE         16
-#define CV_TRAIN_STATMODEL_RESPONSES_ON_OUTPUT      32
-#define CV_TRAIN_STATMODEL_ALWAYS_COPY_TRAIN_DATA   64
-#define CV_TRAIN_STATMODEL_SPARSE_AS_SPARSE         128
-
-int
-cvPrepareTrainData( const char* /*funcname*/,
-                    const CvMat* train_data, int tflag,
-                    const CvMat* responses, int response_type,
-                    const CvMat* var_idx,
-                    const CvMat* sample_idx,
-                    bool always_copy_data,
-                    const float*** out_train_samples,
-                    int* _sample_count,
-                    int* _var_count,
-                    int* _var_all,
-                    CvMat** out_responses,
-                    CvMat** out_response_map,
-                    CvMat** out_var_idx,
-                    CvMat** out_sample_idx=0 );
-
-void
-cvSortSamplesByClasses( const float** samples, const CvMat* classes,
-                        int* class_ranges, const uchar** mask CV_DEFAULT(0) );
-
-void
-cvCombineResponseMaps (CvMat*  _responses,
-                 const CvMat*  old_response_map,
-                       CvMat*  new_response_map,
-                       CvMat** out_response_map);
-
-void
-cvPreparePredictData( const CvArr* sample, int dims_all, const CvMat* comp_idx,
-                      int class_count, const CvMat* prob, float** row_sample,
-                      int as_sparse CV_DEFAULT(0) );
-
-/* copies clustering [or batch "predict"] results
-   (labels and/or centers and/or probs) back to the output arrays */
-void
-cvWritebackLabels( const CvMat* labels, CvMat* dst_labels,
-                   const CvMat* centers, CvMat* dst_centers,
-                   const CvMat* probs, CvMat* dst_probs,
-                   const CvMat* sample_idx, int samples_all,
-                   const CvMat* comp_idx, int dims_all );
-#define cvWritebackResponses cvWritebackLabels
-
-#define XML_FIELD_NAME "_name"
-CvFileNode* icvFileNodeGetChild(CvFileNode* father, const char* name);
-CvFileNode* icvFileNodeGetChildArrayElem(CvFileNode* father, const char* name,int index);
-CvFileNode* icvFileNodeGetNext(CvFileNode* n, const char* name);
-
-
-void cvCheckTrainData( const CvMat* train_data, int tflag,
-                       const CvMat* missing_mask,
-                       int* var_all, int* sample_all );
-
-CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, bool check_for_duplicates=false );
-
-CvMat* cvPreprocessVarType( const CvMat* type_mask, const CvMat* var_idx,
-                            int var_all, int* response_type );
-
-CvMat* cvPreprocessOrderedResponses( const CvMat* responses,
-                const CvMat* sample_idx, int sample_all );
-
-CvMat* cvPreprocessCategoricalResponses( const CvMat* responses,
-                const CvMat* sample_idx, int sample_all,
-                CvMat** out_response_map, CvMat** class_counts=0 );
-
-const float** cvGetTrainSamples( const CvMat* train_data, int tflag,
-                   const CvMat* var_idx, const CvMat* sample_idx,
-                   int* _var_count, int* _sample_count,
-                   bool always_copy_data=false );
+/* log(2*PI) */
+#define CV_LOG2PI (1.8378770664093454835606594728112)

 namespace cv
 {
-    struct DTreeBestSplitFinder
+namespace ml
+{
+    using std::vector;
+
+    #define CV_DTREE_CAT_DIR(idx,subset) \
+        (2*((subset[(idx)>>5]&(1 << ((idx) & 31)))==0)-1)
+
+    template<typename _Tp> struct cmp_lt_idx
    {
-        DTreeBestSplitFinder(){ splitSize = 0, tree = 0; node = 0; }
-        DTreeBestSplitFinder( CvDTree* _tree, CvDTreeNode* _node);
-        DTreeBestSplitFinder( const DTreeBestSplitFinder& finder, Split );
-        virtual ~DTreeBestSplitFinder() {}
-        virtual void operator()(const BlockedRange& range);
-        void join( DTreeBestSplitFinder& rhs );
-        Ptr<CvDTreeSplit> bestSplit;
-        Ptr<CvDTreeSplit> split;
-        int splitSize;
-        CvDTree* tree;
-        CvDTreeNode* node;
+        cmp_lt_idx(const _Tp* _arr) : arr(_arr) {}
+        bool operator ()(int a, int b) const { return arr[a] < arr[b]; }
+        const _Tp* arr;
    };

-    struct ForestTreeBestSplitFinder : DTreeBestSplitFinder
+    template<typename _Tp> struct cmp_lt_ptr
    {
-        ForestTreeBestSplitFinder() : DTreeBestSplitFinder() {}
-        ForestTreeBestSplitFinder( CvForestTree* _tree, CvDTreeNode* _node );
-        ForestTreeBestSplitFinder( const ForestTreeBestSplitFinder& finder, Split );
-        virtual void operator()(const BlockedRange& range);
+        cmp_lt_ptr() {}
+        bool operator ()(const _Tp* a, const _Tp* b) const { return *a < *b; }
    };
-}

-#endif /* __ML_H__ */
+    static inline void setRangeVector(std::vector<int>& vec, int n)
+    {
+        vec.resize(n);
+        for( int i = 0; i < n; i++ )
+            vec[i] = i;
+    }
+
+    static inline void writeTermCrit(FileStorage& fs, const TermCriteria& termCrit)
+    {
+        if( (termCrit.type & TermCriteria::EPS) != 0 )
+            fs << "epsilon" << termCrit.epsilon;
+        if( (termCrit.type & TermCriteria::COUNT) != 0 )
+            fs << "iterations" << termCrit.maxCount;
+    }
+
+    static inline TermCriteria readTermCrit(const FileNode& fn)
+    {
+        TermCriteria termCrit;
+        double epsilon = (double)fn["epsilon"];
+        if( epsilon > 0 )
+        {
+            termCrit.type |= TermCriteria::EPS;
+            termCrit.epsilon = epsilon;
+        }
+        int iters = (double)fn["iterations"];
+        if( iters > 0 )
+        {
+            termCrit.type |= TermCriteria::COUNT;
+            termCrit.maxCount = iters;
+        }
+        return termCrit;
+    }
+
+    class DTreesImpl : public DTrees
+    {
+    public:
+        struct WNode
+        {
+            WNode()
+            {
+                class_idx = sample_count = depth = complexity = 0;
+                parent = left = right = split = defaultDir = -1;
+                Tn = INT_MAX;
+                value = maxlr = alpha = node_risk = tree_risk = tree_error = 0.;
+            }
+
+            int class_idx;
+            int Tn;
+            double value;
+
+            int parent;
+            int left;
+            int right;
+            int defaultDir;
+
+            int split;
+
+            int sample_count;
+            int depth;
+            double maxlr;
+
+            // global pruning data
+            int complexity;
+            double alpha;
+            double node_risk, tree_risk, tree_error;
+        };
+
+        struct WSplit
+        {
+            WSplit()
+            {
+                varIdx = inversed = next = 0;
+                quality = c = 0.f;
+                subsetOfs = -1;
+            }
+
+            int varIdx;
+            int inversed;
+            float quality;
+            int next;
+            float c;
+            int subsetOfs;
+        };
+
+        struct WorkData
+        {
+            WorkData(const Ptr<TrainData>& _data);
+
+            Ptr<TrainData> data;
+            vector<WNode> wnodes;
+            vector<WSplit> wsplits;
+            vector<int> wsubsets;
+            vector<int> cv_Tn;
+            vector<double> cv_node_risk;
+            vector<double> cv_node_error;
+            vector<int> cv_labels;
+            vector<double> sample_weights;
+            vector<int> cat_responses;
+            vector<double> ord_responses;
+            vector<int> sidx;
+            int maxSubsetSize;
+        };
+
+        DTreesImpl();
+        virtual ~DTreesImpl();
+        virtual void clear();
+
+        String getDefaultModelName() const { return "opencv_ml_dtree"; }
+        bool isTrained() const { return !roots.empty(); }
+        bool isClassifier() const { return _isClassifier; }
+        int getVarCount() const { return varType.empty() ? 0 : (int)(varType.size() - 1); }
+        int getCatCount(int vi) const { return catOfs[vi][1] - catOfs[vi][0]; }
+        int getSubsetSize(int vi) const { return (getCatCount(vi) + 31)/32; }
+
+        virtual void setDParams(const Params& _params);
+        virtual Params getDParams() const;
+        virtual void startTraining( const Ptr<TrainData>& trainData, int flags );
+        virtual void endTraining();
+        virtual void initCompVarIdx();
+        virtual bool train( const Ptr<TrainData>& trainData, int flags );
+
+        virtual int addTree( const vector<int>& sidx );
+        virtual int addNodeAndTrySplit( int parent, const vector<int>& sidx );
+        virtual const vector<int>& getActiveVars();
+        virtual int findBestSplit( const vector<int>& _sidx );
+        virtual void calcValue( int nidx, const vector<int>& _sidx );
+
+        virtual WSplit findSplitOrdClass( int vi, const vector<int>& _sidx, double initQuality );
+
+        // simple k-means, slightly modified to take into account the "weight" (L1-norm) of each vector.
+        virtual void clusterCategories( const double* vectors, int n, int m, double* csums, int k, int* labels );
+        virtual WSplit findSplitCatClass( int vi, const vector<int>& _sidx, double initQuality, int* subset );
+
+        virtual WSplit findSplitOrdReg( int vi, const vector<int>& _sidx, double initQuality );
+        virtual WSplit findSplitCatReg( int vi, const vector<int>& _sidx, double initQuality, int* subset );
+
+        virtual int calcDir( int splitidx, const vector<int>& _sidx, vector<int>& _sleft, vector<int>& _sright );
+        virtual int pruneCV( int root );
+
+        virtual double updateTreeRNC( int root, double T, int fold );
+        virtual bool cutTree( int root, double T, int fold, double min_alpha );
+        virtual float predictTrees( const Range& range, const Mat& sample, int flags ) const;
+        virtual float predict( InputArray inputs, OutputArray outputs, int flags ) const;
+
+        virtual void writeTrainingParams( FileStorage& fs ) const;
+        virtual void writeParams( FileStorage& fs ) const;
+        virtual void writeSplit( FileStorage& fs, int splitidx ) const;
+        virtual void writeNode( FileStorage& fs, int nidx, int depth ) const;
+        virtual void writeTree( FileStorage& fs, int root ) const;
+        virtual void write( FileStorage& fs ) const;
+
+        virtual void readParams( const FileNode& fn );
+        virtual int readSplit( const FileNode& fn );
+        virtual int readNode( const FileNode& fn );
+        virtual int readTree( const FileNode& fn );
+        virtual void read( const FileNode& fn );
+
+        virtual const std::vector<int>& getRoots() const { return roots; }
+        virtual const std::vector<Node>& getNodes() const { return nodes; }
+        virtual const std::vector<Split>& getSplits() const { return splits; }
+        virtual const std::vector<int>& getSubsets() const { return subsets; }
+        
+        Params params0, params;
+        
+        vector<int> varIdx;
+        vector<int> compVarIdx;
+        vector<uchar> varType;
+        vector<Vec2i> catOfs;
+        vector<int> catMap;
+        vector<int> roots;
+        vector<Node> nodes;
+        vector<Split> splits;
+        vector<int> subsets;
+        vector<int> classLabels;
+        vector<float> missingSubst;
+        bool _isClassifier;
+        
+        Ptr<WorkData> w;
+    };
+
+}}
+
+#endif /* __OPENCV_ML_PRECOMP_HPP__ */