skip of table header in MLData (#1962)
This commit is contained in:
parent
094c32ced7
commit
4d36be8794
@ -3,7 +3,7 @@ MLData
|
|||||||
|
|
||||||
.. highlight:: cpp
|
.. highlight:: cpp
|
||||||
|
|
||||||
For the machine learning algorithms, the data set is often stored in a file of the ``.csv``-like format. The file contains a table of predictor and response values where each row of the table corresponds to a sample. Missing values are supported. The UC Irvine Machine Learning Repository (http://archive.ics.uci.edu/ml/) provides many data sets stored in such a format to the machine learning community. The class ``MLData`` is implemented to easily load the data for training one of the OpenCV machine learning algorithms. For float values, only the ``'.'`` separator is supported.
|
For the machine learning algorithms, the data set is often stored in a file of the ``.csv``-like format. The file contains a table of predictor and response values where each row of the table corresponds to a sample. Missing values are supported. The UC Irvine Machine Learning Repository (http://archive.ics.uci.edu/ml/) provides many data sets stored in such a format to the machine learning community. The class ``MLData`` is implemented to easily load the data for training one of the OpenCV machine learning algorithms. For float values, only the ``'.'`` separator is supported. The table can have a header, in which case the user has to set the number of header lines so that they are skipped during file reading.
|
||||||
|
|
||||||
CvMLData
|
CvMLData
|
||||||
--------
|
--------
|
||||||
@ -182,6 +182,20 @@ Sets the variables types in the loaded data.
|
|||||||
|
|
||||||
In the string, a variable type is followed by a list of variables indices. For example: ``"ord[0-17],cat[18]"``, ``"ord[0,2,4,10-12], cat[1,3,5-9,13,14]"``, ``"cat"`` (all variables are categorical), ``"ord"`` (all variables are ordered).
|
In the string, a variable type is followed by a list of variables indices. For example: ``"ord[0-17],cat[18]"``, ``"ord[0,2,4,10-12], cat[1,3,5-9,13,14]"``, ``"cat"`` (all variables are categorical), ``"ord"`` (all variables are ordered).
|
||||||
|
|
||||||
|
CvMLData::get_header_lines_number
|
||||||
|
---------------------------------
|
||||||
|
Returns the number of table header lines.
|
||||||
|
|
||||||
|
.. ocv:function:: int CvMLData::get_header_lines_number() const
|
||||||
|
|
||||||
|
CvMLData::set_header_lines_number
|
||||||
|
---------------------------------
|
||||||
|
Sets the number of table header lines.
|
||||||
|
|
||||||
|
.. ocv:function:: void CvMLData::set_header_lines_number( int n )
|
||||||
|
|
||||||
|
By default, the table is assumed to have no header, i.e. it contains only the data.
|
||||||
|
|
||||||
CvMLData::get_var_type
|
CvMLData::get_var_type
|
||||||
----------------------
|
----------------------
|
||||||
Returns type of the specified variable
|
Returns type of the specified variable
|
||||||
|
@ -2040,6 +2040,9 @@ public:
|
|||||||
const CvMat* get_responses();
|
const CvMat* get_responses();
|
||||||
const CvMat* get_missing() const;
|
const CvMat* get_missing() const;
|
||||||
|
|
||||||
|
void set_header_lines_number( int n );
|
||||||
|
int get_header_lines_number() const;
|
||||||
|
|
||||||
void set_response_idx( int idx ); // old response become predictors, new response_idx = idx
|
void set_response_idx( int idx ); // old response become predictors, new response_idx = idx
|
||||||
// if idx < 0 there will be no response
|
// if idx < 0 there will be no response
|
||||||
int get_response_idx() const;
|
int get_response_idx() const;
|
||||||
@ -2091,6 +2094,8 @@ protected:
|
|||||||
CvMat* var_idx_out; // mat
|
CvMat* var_idx_out; // mat
|
||||||
CvMat* var_types_out; // mat
|
CvMat* var_types_out; // mat
|
||||||
|
|
||||||
|
int header_lines_number;
|
||||||
|
|
||||||
int response_idx;
|
int response_idx;
|
||||||
|
|
||||||
int train_sample_count;
|
int train_sample_count;
|
||||||
|
@ -71,6 +71,7 @@ CvMLData::CvMLData()
|
|||||||
{
|
{
|
||||||
values = missing = var_types = var_idx_mask = response_out = var_idx_out = var_types_out = 0;
|
values = missing = var_types = var_idx_mask = response_out = var_idx_out = var_types_out = 0;
|
||||||
train_sample_idx = test_sample_idx = 0;
|
train_sample_idx = test_sample_idx = 0;
|
||||||
|
header_lines_number = 0;
|
||||||
sample_idx = 0;
|
sample_idx = 0;
|
||||||
response_idx = -1;
|
response_idx = -1;
|
||||||
|
|
||||||
@ -117,6 +118,17 @@ void CvMLData::clear()
|
|||||||
train_sample_count = -1;
|
train_sample_count = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void CvMLData::set_header_lines_number( int idx )
|
||||||
|
{
|
||||||
|
header_lines_number = std::max(0, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
int CvMLData::get_header_lines_number() const
|
||||||
|
{
|
||||||
|
return header_lines_number;
|
||||||
|
}
|
||||||
|
|
||||||
static char *fgets_chomp(char *str, int n, FILE *stream)
|
static char *fgets_chomp(char *str, int n, FILE *stream)
|
||||||
{
|
{
|
||||||
char *head = fgets(str, n, stream);
|
char *head = fgets(str, n, stream);
|
||||||
@ -153,9 +165,15 @@ int CvMLData::read_csv(const char* filename)
|
|||||||
if( !file )
|
if( !file )
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
// read the first line and determine the number of variables
|
std::vector<char> _buf(M);
|
||||||
std::vector<char> _buf(M);
|
|
||||||
char* buf = &_buf[0];
|
char* buf = &_buf[0];
|
||||||
|
|
||||||
|
// skip header lines
|
||||||
|
for( int i = 0; i < header_lines_number; i++ )
|
||||||
|
if( fgets( buf, M, file ) == 0 )
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
// read the first data line and determine the number of variables
|
||||||
if( !fgets_chomp( buf, M, file ))
|
if( !fgets_chomp( buf, M, file ))
|
||||||
{
|
{
|
||||||
fclose(file);
|
fclose(file);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user