speech-tools/stats/EST_ols.cc
2015-09-19 10:52:26 +02:00

344 lines
9.9 KiB
C++

/*************************************************************************/
/* */
/* Centre for Speech Technology Research */
/* University of Edinburgh, UK */
/* Copyright (c) 1998 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author : Alan W Black (and lots of books) */
/* Date : January 1998 */
/*-----------------------------------------------------------------------*/
/* Ordinary Least Squares/Linear regression */
/* */
/*=======================================================================*/
#include <cmath>
#include "EST_multistats.h"
#include "EST_simplestats.h"
static void ols_load_selected_feats(const EST_FMatrix &X,
const EST_IVector &included,
EST_FMatrix &Xl);
static int ols_stepwise_find_best(const EST_FMatrix &X,
const EST_FMatrix &Y,
EST_IVector &included,
EST_FMatrix &coeffs,
float &bscore,
int &best_feat,
const EST_FMatrix &Xtest,
const EST_FMatrix &Ytest,
const EST_StrList &feat_names
);
int ols(const EST_FMatrix &X,const EST_FMatrix &Y, EST_FMatrix &coeffs)
{
// Ordinary least squares, X contains the samples with 1 (for intercept)
// in column 0, Y contains the single values.
EST_FMatrix Xplus;
if (!pseudo_inverse(X,Xplus))
return FALSE;
multiply(Xplus,Y,coeffs);
return TRUE;
}
int robust_ols(const EST_FMatrix &X,
const EST_FMatrix &Y,
EST_FMatrix &coeffs)
{
EST_IVector included;
int i;
included.resize(X.num_columns());
for (i=0; i<included.length(); i++)
included.a_no_check(i) = TRUE;
return robust_ols(X,Y,included,coeffs);
}
int robust_ols(const EST_FMatrix &X,
const EST_FMatrix &Y,
EST_IVector &included,
EST_FMatrix &coeffs)
{
// as with ols but if the pseudo inverse fails remove the offending
// features and try again until it works, this can be costly but
// its saves *you* from finding the singularity
// This expands the output and puts weights of 0 for omitted features
EST_FMatrix Xl;
EST_FMatrix coeffsl;
EST_FMatrix Xplus;
int i,j,singularity=-1;
if (X.num_rows() <= X.num_columns())
{
cerr << "OLS: less rows than columns, so cannot find solution."
<< endl;
return FALSE;
}
if (X.num_columns() != included.length())
{
cerr << "OLS: `included' list wrong size: internal error."
<< endl;
return FALSE;
}
while (TRUE)
{
ols_load_selected_feats(X,included,Xl);
if (pseudo_inverse(Xl,Xplus,singularity))
{
multiply(Xplus,Y,coeffsl);
break;
}
else
{ // found a singularity so try again without that column
// remap singularity position back to X
int s;
for (s=i=0; i<singularity; i++)
{
s++;
while ((included(s) == FALSE) ||
(included(s) == OLS_IGNORE))
s++;
}
if (included(s) == FALSE)
{ // oops
cerr << "OLS: found singularity twice, shouldn't happen"
<< endl;
return FALSE;
}
else
{
cerr << "OLS: omitting singularity in column " << s << endl;
included[s] = FALSE;
}
}
}
// Map coefficients back, making coefficient 0 for singular cols
coeffs.resize(X.num_columns(),1);
for (j=i=0; i<X.num_columns(); i++)
if (included(i))
{
coeffs.a_no_check(i,0) = coeffsl(j,0);
j++;
}
else
coeffs.a_no_check(i,0) = 0.0;
return TRUE;
}
static void ols_load_selected_feats(const EST_FMatrix &X,
const EST_IVector &included,
EST_FMatrix &Xl)
{
int i,j,k,width;
for (width=i=0; i<included.length(); i++)
if (included(i) == TRUE)
width++;
Xl.resize(X.num_rows(),width);
for (i=0; i<X.num_rows(); i++)
for (k=j=0; j < X.num_columns(); j++)
if (included(j) == TRUE)
{
Xl.a_no_check(i,k) = X.a_no_check(i,j);
k++;
}
}
int ols_apply(const EST_FMatrix &samples,
const EST_FMatrix &coeffs,
EST_FMatrix &res)
{
// Apply coefficients to samples for res.
if (samples.num_columns() != coeffs.num_rows())
return FALSE;
multiply(samples,coeffs,res);
return TRUE;
}
int stepwise_ols(const EST_FMatrix &X,
const EST_FMatrix &Y,
const EST_StrList &feat_names,
float limit,
EST_FMatrix &coeffs,
const EST_FMatrix &Xtest,
const EST_FMatrix &Ytest,
EST_IVector &included)
{
// Find the features that contribute to the correlation using a
// a greedy algorithm
EST_FMatrix coeffsl;
float best_score=0.0,bscore;
int i,best_feat;
int nf=1; // for nice printing of progress
for (i=1; i < X.num_columns(); i++)
{
if (!ols_stepwise_find_best(X,Y,included,coeffsl,
bscore,best_feat,Xtest,Ytest,
feat_names))
{
cerr << "OLS: stepwise failed" << endl;
return FALSE;
}
if ((bscore - (bscore * (limit/100))) <= best_score)
break;
else
{
best_score = bscore;
coeffs = coeffsl;
included[best_feat] = TRUE;
printf("FEATURE %d %s: %2.4f\n",
nf,
(const char *)feat_names.nth(best_feat),
best_score);
fflush(stdout);
nf++;
}
}
return TRUE;
}
static int ols_stepwise_find_best(const EST_FMatrix &X,
const EST_FMatrix &Y,
EST_IVector &included,
EST_FMatrix &coeffs,
float &bscore,
int &best_feat,
const EST_FMatrix &Xtest,
const EST_FMatrix &Ytest,
const EST_StrList &feat_names
)
{
EST_FMatrix coeffsl;
bscore = 0;
best_feat = -1;
int i;
for (i=0; i < included.length(); i++)
{
if (included.a_no_check(i) == FALSE)
{
float cor, rmse;
EST_FMatrix pred;
included.a_no_check(i) = TRUE;
if (!robust_ols(X,Y,included,coeffsl))
return FALSE; // failed for some reason
ols_apply(Xtest,coeffsl,pred);
ols_test(Ytest,pred,cor,rmse);
printf("tested %d %s %f best %f\n",
i,(const char *)feat_names.nth(i),cor,bscore);
if (fabs(cor) > bscore)
{
bscore = fabs(cor);
coeffs = coeffsl;
best_feat = i;
}
included.a_no_check(i) = FALSE;
}
}
return TRUE;
}
int ols_test(const EST_FMatrix &real,
const EST_FMatrix &predicted,
float &correlation,
float &rmse)
{
// Others probably want this function too
// return correlation and RMSE for col 0 in real and predicted
int i;
float p,r;
EST_SuffStats x,y,xx,yy,xy,se,e;
double error;
double v1,v2,v3;
if (real.num_rows() != predicted.num_rows())
return FALSE; // can't do this
for (i=0; i < real.num_rows(); i++)
{
r = real(i,0);
p = predicted(i,0);
x += p;
y += r;
error = p-r;
se += error*error;
e += fabs(error);
xx += p*p;
yy += r*r;
xy += p*r;
}
rmse = sqrt(se.mean());
v1 = xx.mean()-(x.mean()*x.mean());
v2 = yy.mean()-(y.mean()*y.mean());
v3 = v1*v2;
if (v3 <= 0)
{ // happens when there's very little variation in x
correlation = 0;
rmse = se.mean();
return FALSE;
}
// Pearson's product moment correlation coefficient
correlation = (xy.mean() - (x.mean()*y.mean()))/ sqrt(v3);
// I hate to have to do this but it is necessary.
// When the the variation of X is very small v1*v2 approaches
// 0 (the negative and equals case is caught above) but that
// may not be enough when v1 or v2 are very small but positive.
// So I catch it here. If I knew more math I'd be able to describe
// this better but the code would remain the same.
if ((correlation <= 1.0) && (correlation >= -1.0))
return TRUE;
else
{
correlation = 0;
return FALSE;
}
}