speech-tools/grammar/scfg/EST_SCFG_Chart.cc
2015-09-19 10:52:26 +02:00

570 lines
15 KiB
C++

/*************************************************************************/
/* */
/* Centre for Speech Technology Research */
/* University of Edinburgh, UK */
/* Copyright (c) 1997 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author : Alan W Black */
/* Date : June 1997 */
/*-----------------------------------------------------------------------*/
/* */
/* A SCFG chart parser */
/* */
/*=======================================================================*/
#include <cstdlib>
#include "siod.h"
#include "EST_math.h"
#include "EST_SCFG.h"
#include "EST_SCFG_Chart.h"
EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge()
{
}
EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge(double prob,
int d1, int d2,
int pos)
{
p_d1 = d1;
p_d2 = d2;
p_pos = pos;
p_prob = prob;
}
EST_SCFG_Chart_Edge::~EST_SCFG_Chart_Edge()
{
}
LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
EST_SCFG_Chart_Edge *e)
{
// Return a lisp representation of the edge
if (e->prob() == 0)
{
return NIL; // failed
}
else if (start+1 == end)
{
// unary rule, preterminal
LISP r = cons(rintern(grammar->nonterminal(p)),
cons(flocons(e->prob()),
cons(flocons(start),
cons(flocons(end),
cons(rintern(grammar->terminal(e->d1())),
NIL)))));
return r;
}
else
{
//name prob start end daughters
EST_SCFG_Chart_Edge *d1, *d2;
d1 = edges[start][e->pos()][e->d1()];
d2 = edges[e->pos()][end][e->d2()];
LISP daughters =
cons(print_edge(start,e->pos(),e->d1(),d1),
cons(print_edge(e->pos(),end,e->d2(),d2),
NIL));
LISP r = cons(rintern(grammar->nonterminal(p)),
cons(flocons(e->prob()),
cons(flocons(start),
cons(flocons(end),
daughters))));
return r;
}
}
EST_SCFG_Chart::EST_SCFG_Chart()
{
n_vertices = 0;
edges = 0;
wfst = 0;
emptyedge = 0;
grammar_local = TRUE;
grammar = new EST_SCFG;
}
EST_SCFG_Chart::~EST_SCFG_Chart()
{
// delete all the vertices
delete_edge_table();
if (grammar_local)
delete grammar;
}
void EST_SCFG_Chart::set_grammar_rules(EST_SCFG &imported_grammar)
{
if (grammar_local)
delete grammar;
grammar_local = FALSE;
grammar = &imported_grammar;
}
void EST_SCFG_Chart::set_grammar_rules(LISP r)
{
grammar->set_rules(r);
}
void EST_SCFG_Chart::setup_wfst(EST_Relation *s,const EST_String &name)
{
// Set up well formed substring table from feature name in each
// stream item in s
setup_wfst(s->head(),0,name);
}
void EST_SCFG_Chart::setup_wfst(EST_Item *s, EST_Item *e,const EST_String &name)
{
// Set up well formed substring table from feature name in each
// stream item in s
EST_Item *p;
int n;
delete_edge_table();
for (n_vertices=1,p=s; p != e; p=p->next())
n_vertices++;
setup_edge_table();
for (n=0,p=s; p != e; p=p->next(),n++)
{
int term = grammar->terminal(p->f(name).string());
if (term == -1)
{
cerr << "SCFG_Chart: unknown terminal symbol \"" <<
p->f(name).string() << "\"" << endl;
term = 0; // to avoid crashing
}
wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
}
}
void EST_SCFG_Chart::delete_edge_table()
{
int i,j,k;
if (wfst == 0) return;
for (i=0; i < n_vertices; i++)
{
delete wfst[i];
for (j=0; j < n_vertices; j++)
{
for (k=0; k < grammar->num_nonterminals(); k++)
if (edges[i][j][k] != emptyedge)
delete edges[i][j][k];
delete [] edges[i][j];
}
delete [] edges[i];
}
delete [] wfst;
delete [] edges;
delete emptyedge;
wfst = 0;
edges = 0;
}
void EST_SCFG_Chart::setup_edge_table()
{
int i,j,k;
int nt = grammar->num_nonterminals();
wfst = new EST_SCFG_Chart_Edge*[n_vertices];
edges = new EST_SCFG_Chart_Edge ***[n_vertices];
emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
for (i=0; i < n_vertices; i++)
{
wfst[i] = 0;
edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
for (j=0; j < n_vertices; j++)
{
edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
for (k=0; k < nt; k++)
edges[i][j][k] = 0;
}
}
}
double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
{
// Find the best parse for non/terminal p between start and end
int best_j = -1;
int best_q = -1, best_r = -1;
double best_prob = 0;
if (end-1 == start)
{
best_prob = grammar->prob_U(p,wfst[start]->d1());
if (best_prob > 0)
edges[start][end][p] =
new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
wfst[start]->d1(),0,-1);
else
edges[start][end][p] = emptyedge;
return best_prob;
}
else
{
// for all rules expanding p find total and best prob
double s=0,t_prob,left,right;
int j,q,r;
int nt = grammar->num_nonterminals();
for (q=0; q < nt; q++)
for (r=0; r < nt; r++)
{
double pBpqr = grammar->prob_B(p,q,r);
if (pBpqr > 0)
{
for (j=start+1; j < end; j++)
{
left=find_best_tree(start,j,q);
if (left > 0)
{
right = find_best_tree(j,end,r);
t_prob = pBpqr * left * right;
if (t_prob > best_prob)
{
best_prob = t_prob;
best_q = q;
best_r = r;
best_j = j;
}
s += t_prob;
}
}
}
}
if (best_prob > 0)
edges[start][end][p] =
new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
else
edges[start][end][p] = emptyedge;
return s;
}
}
void EST_SCFG_Chart::parse(void)
{
// do the parsing, find best parse only
if (n_vertices - 1 > 0)
find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
}
LISP EST_SCFG_Chart::find_parse()
{
// Extract the parse from the edge table
EST_SCFG_Chart_Edge *top;
top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
if (top == 0)
return NIL; // no parse
else
return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
}
void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
EST_Relation *word,
int force)
{
// Build a tree stream in Syn linking the leafs of Syn to those
// in word, force guarantees a parse is necessary (though probably
// a random one)
extract_parse(syn,word->head(),0,force);
}
void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
EST_Item *s, EST_Item *e, int force)
{
// Build a tree stream in Syn linking the leafs of Syn to those
// in word
EST_Item *p;
int num_words;
for (num_words=0,p=s; p != e; p=p->next())
num_words++;
if (num_words != (n_vertices-1))
{
cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
" different from those in parse tree" << endl;
return;
}
EST_SCFG_Chart_Edge *top;
EST_Item *w = s;
top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
if (top == 0)
return; // failed to parse so no parse tree
else
{
EST_Item *ss = syn->append();
extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
ss,&w);
if ((force) && (!daughter1(ss))) // no parse found but *need* one
extract_forced_parse(0,n_vertices-1,ss,w);
return;
}
}
void EST_SCFG_Chart::extract_forced_parse(int start, int end,
EST_Item *s, EST_Item *w)
{
// Extract a parse even though one wasn't found (often happens
// with single word or dual word sentences.
if (start+1 == end)
{
s->append_daughter(w);
s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
s->set("prob",0.0); // maybe should be epsilon
}
else
{
extract_forced_parse(start,start+1,s->append_daughter(),w);
EST_Item *st = s->append_daughter();
st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
st->set("prob",0.0); // maybe should be epsilon
EST_Item *nw = w->next();
extract_forced_parse(start+1,end,st,nw);
}
}
void EST_SCFG_Chart::extract_edge(int start, int end, int p,
EST_SCFG_Chart_Edge *e,
EST_Item *s,
EST_Item **word)
{
// Build the node for this edge, and all of its daughters
if (e->prob() == 0)
{
return; // failed
}
else if (start+1 == end)
{
// unary rule, preterminal
s->append_daughter((*word));
s->set_name(grammar->nonterminal(p));
s->set("prob",(float)e->prob());
*word = (*word)->next(); // increment along "word" stream
return;
}
else
{
//name prob start end daughters
EST_SCFG_Chart_Edge *d1, *d2;
d1 = edges[start][e->pos()][e->d1()];
d2 = edges[e->pos()][end][e->d2()];
// Inserts the new nodes in the tree (and creates new si nodes)
s->append_daughter();
s->append_daughter();
extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
s->set_name(grammar->nonterminal(p));
s->set("prob",(float)e->prob());
return;
}
}
void EST_SCFG_chart_load_relation(EST_Relation &s,LISP sent)
{
// Set up well formed substring table form lisp list
// Setup a relation and call the standard method of set up
LISP w,f;
for (w=sent; w != NIL; w=cdr(w))
{
EST_Item *word = s.append();
if (consp(car(w)))
{ // a word with other feature info
word->set_name(get_c_string(car(car(w))));
if (consp(car(cdr(car(w)))))
for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
{
if (FLONUMP(car(cdr(car(f)))))
word->set(get_c_string(car(car(f))),
get_c_float(car(cdr(car(f)))));
else
word->set(get_c_string(car(car(f))),
get_c_string(car(cdr(car(f)))));
}
else // we assume its a POS value, cause they didn't say
word->set("name",get_c_string(car(cdr(car(w)))));
}
else // for easy we set the pos field to the be the name
word->set("name",get_c_string(car(w)));
}
}
void scfg_parse(EST_Relation *Word, const EST_String &name,
EST_Relation *Syntax, EST_SCFG &grammar)
{
// Parse feature name in Word to build Syntax relation
// The relations names above are *not* the names of the relations
// just named to reflect there conceptual usage
EST_SCFG_Chart chart;
chart.set_grammar_rules(grammar);
chart.setup_wfst(Word,name);
chart.parse();
chart.extract_parse(Syntax,Word,TRUE);
return;
}
LISP scfg_parse(LISP string, LISP grammar)
{
// Parse and return full parse
EST_SCFG_Chart chart;
EST_Relation words;
LISP parse;
chart.set_grammar_rules(grammar);
EST_SCFG_chart_load_relation(words,string);
chart.setup_wfst(&words,"name");
chart.parse();
parse = chart.find_parse();
return parse;
}
LISP scfg_parse(LISP string, EST_SCFG &grammar)
{
// Parse and return full parse
EST_SCFG_Chart chart;
EST_Relation words;
LISP parse;
chart.set_grammar_rules(grammar);
EST_SCFG_chart_load_relation(words,string);
chart.setup_wfst(&words,"name");
chart.parse();
parse = chart.find_parse();
return parse;
}
LISP scfg_bracketing_only(LISP parse)
{
if (consp(siod_nth(4,parse)))
{
LISP d,ds;
for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
ds = cons(scfg_bracketing_only(car(d)),ds);
return reverse(ds);
}
else
return siod_nth(4,parse);
}
void EST_SCFG_traintest::test_crossbrackets()
{
// Compare bracketing of best parse to bracketing on original
// For each sentence parse it (unbracketed) and then
// find the percentage of valid brackets in parsed version that
// are valid in the original one.
int c;
LISP parse;
EST_SuffStats cb;
int failed = 0;
int fully_contained=0;
for (c=0; c < corpus.length(); c++)
{
LISP flat = siod_flatten(corpus.a_no_check(c).string());
parse = scfg_parse(flat,*this);
if (parse == NIL)
{
failed++;
continue;
}
EST_bracketed_string parsed(scfg_bracketing_only(parse));
EST_SuffStats vs;
count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
if (vs.mean() == 1)
fully_contained++;
cb += vs.mean();
}
cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
" failed " << (float)(100.0*fully_contained)/corpus.length() <<
"% fully consistent from " << corpus.length()
<< " sentences)" << endl;
}
void count_bracket_crossing(const EST_bracketed_string &ref,
const EST_bracketed_string &test,
EST_SuffStats &vs)
{
int i,j;
if (ref.length() != test.length())
{
EST_error("bracket_crossing: sentences of different lengths");
}
for (i=0; i < ref.length(); i++)
for (j=i+1; j <= ref.length(); j++)
if (test.valid(i,j) == 1)
{
if (ref.valid(i,j) == 0)
vs += 0;
else
vs += 1;
}
}