570 lines
15 KiB
C++
570 lines
15 KiB
C++
/*************************************************************************/
|
|
/* */
|
|
/* Centre for Speech Technology Research */
|
|
/* University of Edinburgh, UK */
|
|
/* Copyright (c) 1997 */
|
|
/* All Rights Reserved. */
|
|
/* */
|
|
/* Permission is hereby granted, free of charge, to use and distribute */
|
|
/* this software and its documentation without restriction, including */
|
|
/* without limitation the rights to use, copy, modify, merge, publish, */
|
|
/* distribute, sublicense, and/or sell copies of this work, and to */
|
|
/* permit persons to whom this work is furnished to do so, subject to */
|
|
/* the following conditions: */
|
|
/* 1. The code must retain the above copyright notice, this list of */
|
|
/* conditions and the following disclaimer. */
|
|
/* 2. Any modifications must be clearly marked as such. */
|
|
/* 3. Original authors' names are not deleted. */
|
|
/* 4. The authors' names are not used to endorse or promote products */
|
|
/* derived from this software without specific prior written */
|
|
/* permission. */
|
|
/* */
|
|
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
|
|
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
|
|
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
|
|
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
|
|
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
|
|
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
|
|
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
|
|
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
|
|
/* THIS SOFTWARE. */
|
|
/* */
|
|
/*************************************************************************/
|
|
/* Author : Alan W Black */
|
|
/* Date : June 1997 */
|
|
/*-----------------------------------------------------------------------*/
|
|
/* */
|
|
/* A SCFG chart parser */
|
|
/* */
|
|
/*=======================================================================*/
|
|
#include <cstdlib>
|
|
#include "siod.h"
|
|
#include "EST_math.h"
|
|
#include "EST_SCFG.h"
|
|
#include "EST_SCFG_Chart.h"
|
|
|
|
EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge()
|
|
{
|
|
}
|
|
|
|
EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge(double prob,
|
|
int d1, int d2,
|
|
int pos)
|
|
{
|
|
p_d1 = d1;
|
|
p_d2 = d2;
|
|
p_pos = pos;
|
|
p_prob = prob;
|
|
}
|
|
|
|
EST_SCFG_Chart_Edge::~EST_SCFG_Chart_Edge()
|
|
{
|
|
}
|
|
|
|
LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
|
|
EST_SCFG_Chart_Edge *e)
|
|
{
|
|
// Return a lisp representation of the edge
|
|
|
|
if (e->prob() == 0)
|
|
{
|
|
return NIL; // failed
|
|
}
|
|
else if (start+1 == end)
|
|
{
|
|
// unary rule, preterminal
|
|
LISP r = cons(rintern(grammar->nonterminal(p)),
|
|
cons(flocons(e->prob()),
|
|
cons(flocons(start),
|
|
cons(flocons(end),
|
|
cons(rintern(grammar->terminal(e->d1())),
|
|
NIL)))));
|
|
return r;
|
|
}
|
|
else
|
|
{
|
|
//name prob start end daughters
|
|
EST_SCFG_Chart_Edge *d1, *d2;
|
|
|
|
d1 = edges[start][e->pos()][e->d1()];
|
|
d2 = edges[e->pos()][end][e->d2()];
|
|
|
|
LISP daughters =
|
|
cons(print_edge(start,e->pos(),e->d1(),d1),
|
|
cons(print_edge(e->pos(),end,e->d2(),d2),
|
|
NIL));
|
|
LISP r = cons(rintern(grammar->nonterminal(p)),
|
|
cons(flocons(e->prob()),
|
|
cons(flocons(start),
|
|
cons(flocons(end),
|
|
daughters))));
|
|
return r;
|
|
}
|
|
}
|
|
|
|
EST_SCFG_Chart::EST_SCFG_Chart()
|
|
{
|
|
n_vertices = 0;
|
|
edges = 0;
|
|
wfst = 0;
|
|
emptyedge = 0;
|
|
grammar_local = TRUE;
|
|
grammar = new EST_SCFG;
|
|
}
|
|
|
|
EST_SCFG_Chart::~EST_SCFG_Chart()
|
|
{
|
|
// delete all the vertices
|
|
|
|
delete_edge_table();
|
|
if (grammar_local)
|
|
delete grammar;
|
|
}
|
|
|
|
void EST_SCFG_Chart::set_grammar_rules(EST_SCFG &imported_grammar)
|
|
{
|
|
if (grammar_local)
|
|
delete grammar;
|
|
grammar_local = FALSE;
|
|
grammar = &imported_grammar;
|
|
}
|
|
|
|
void EST_SCFG_Chart::set_grammar_rules(LISP r)
|
|
{
|
|
grammar->set_rules(r);
|
|
}
|
|
|
|
void EST_SCFG_Chart::setup_wfst(EST_Relation *s,const EST_String &name)
|
|
{
|
|
// Set up well formed substring table from feature name in each
|
|
// stream item in s
|
|
setup_wfst(s->head(),0,name);
|
|
}
|
|
|
|
void EST_SCFG_Chart::setup_wfst(EST_Item *s, EST_Item *e,const EST_String &name)
|
|
{
|
|
// Set up well formed substring table from feature name in each
|
|
// stream item in s
|
|
EST_Item *p;
|
|
int n;
|
|
|
|
delete_edge_table();
|
|
for (n_vertices=1,p=s; p != e; p=p->next())
|
|
n_vertices++;
|
|
setup_edge_table();
|
|
|
|
for (n=0,p=s; p != e; p=p->next(),n++)
|
|
{
|
|
int term = grammar->terminal(p->f(name).string());
|
|
if (term == -1)
|
|
{
|
|
cerr << "SCFG_Chart: unknown terminal symbol \"" <<
|
|
p->f(name).string() << "\"" << endl;
|
|
term = 0; // to avoid crashing
|
|
}
|
|
wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
|
|
}
|
|
}
|
|
|
|
void EST_SCFG_Chart::delete_edge_table()
|
|
{
|
|
int i,j,k;
|
|
|
|
if (wfst == 0) return;
|
|
|
|
for (i=0; i < n_vertices; i++)
|
|
{
|
|
delete wfst[i];
|
|
for (j=0; j < n_vertices; j++)
|
|
{
|
|
for (k=0; k < grammar->num_nonterminals(); k++)
|
|
if (edges[i][j][k] != emptyedge)
|
|
delete edges[i][j][k];
|
|
delete [] edges[i][j];
|
|
}
|
|
delete [] edges[i];
|
|
}
|
|
delete [] wfst;
|
|
delete [] edges;
|
|
delete emptyedge;
|
|
|
|
wfst = 0;
|
|
edges = 0;
|
|
|
|
}
|
|
|
|
void EST_SCFG_Chart::setup_edge_table()
|
|
{
|
|
int i,j,k;
|
|
int nt = grammar->num_nonterminals();
|
|
|
|
wfst = new EST_SCFG_Chart_Edge*[n_vertices];
|
|
edges = new EST_SCFG_Chart_Edge ***[n_vertices];
|
|
emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
|
|
|
|
for (i=0; i < n_vertices; i++)
|
|
{
|
|
wfst[i] = 0;
|
|
edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
|
|
for (j=0; j < n_vertices; j++)
|
|
{
|
|
edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
|
|
for (k=0; k < nt; k++)
|
|
edges[i][j][k] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
|
|
{
|
|
// Find the best parse for non/terminal p between start and end
|
|
int best_j = -1;
|
|
int best_q = -1, best_r = -1;
|
|
double best_prob = 0;
|
|
|
|
if (end-1 == start)
|
|
{
|
|
best_prob = grammar->prob_U(p,wfst[start]->d1());
|
|
if (best_prob > 0)
|
|
edges[start][end][p] =
|
|
new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
|
|
wfst[start]->d1(),0,-1);
|
|
else
|
|
edges[start][end][p] = emptyedge;
|
|
return best_prob;
|
|
}
|
|
else
|
|
{
|
|
// for all rules expanding p find total and best prob
|
|
double s=0,t_prob,left,right;
|
|
int j,q,r;
|
|
int nt = grammar->num_nonterminals();
|
|
|
|
for (q=0; q < nt; q++)
|
|
for (r=0; r < nt; r++)
|
|
{
|
|
double pBpqr = grammar->prob_B(p,q,r);
|
|
if (pBpqr > 0)
|
|
{
|
|
for (j=start+1; j < end; j++)
|
|
{
|
|
left=find_best_tree(start,j,q);
|
|
if (left > 0)
|
|
{
|
|
right = find_best_tree(j,end,r);
|
|
t_prob = pBpqr * left * right;
|
|
if (t_prob > best_prob)
|
|
{
|
|
best_prob = t_prob;
|
|
best_q = q;
|
|
best_r = r;
|
|
best_j = j;
|
|
}
|
|
s += t_prob;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (best_prob > 0)
|
|
edges[start][end][p] =
|
|
new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
|
|
else
|
|
edges[start][end][p] = emptyedge;
|
|
return s;
|
|
}
|
|
}
|
|
|
|
void EST_SCFG_Chart::parse(void)
|
|
{
|
|
// do the parsing, find best parse only
|
|
if (n_vertices - 1 > 0)
|
|
find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
|
|
|
|
}
|
|
|
|
LISP EST_SCFG_Chart::find_parse()
|
|
{
|
|
// Extract the parse from the edge table
|
|
EST_SCFG_Chart_Edge *top;
|
|
|
|
top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
|
|
|
|
if (top == 0)
|
|
return NIL; // no parse
|
|
else
|
|
return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
|
|
}
|
|
|
|
void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
|
|
EST_Relation *word,
|
|
int force)
|
|
{
|
|
// Build a tree stream in Syn linking the leafs of Syn to those
|
|
// in word, force guarantees a parse is necessary (though probably
|
|
// a random one)
|
|
|
|
extract_parse(syn,word->head(),0,force);
|
|
}
|
|
|
|
void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
|
|
EST_Item *s, EST_Item *e, int force)
|
|
{
|
|
// Build a tree stream in Syn linking the leafs of Syn to those
|
|
// in word
|
|
EST_Item *p;
|
|
int num_words;
|
|
|
|
for (num_words=0,p=s; p != e; p=p->next())
|
|
num_words++;
|
|
|
|
if (num_words != (n_vertices-1))
|
|
{
|
|
cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
|
|
" different from those in parse tree" << endl;
|
|
return;
|
|
}
|
|
|
|
EST_SCFG_Chart_Edge *top;
|
|
EST_Item *w = s;
|
|
|
|
top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
|
|
|
|
if (top == 0)
|
|
return; // failed to parse so no parse tree
|
|
else
|
|
{
|
|
EST_Item *ss = syn->append();
|
|
|
|
extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
|
|
ss,&w);
|
|
|
|
if ((force) && (!daughter1(ss))) // no parse found but *need* one
|
|
extract_forced_parse(0,n_vertices-1,ss,w);
|
|
return;
|
|
}
|
|
}
|
|
|
|
void EST_SCFG_Chart::extract_forced_parse(int start, int end,
|
|
EST_Item *s, EST_Item *w)
|
|
{
|
|
// Extract a parse even though one wasn't found (often happens
|
|
// with single word or dual word sentences.
|
|
|
|
if (start+1 == end)
|
|
{
|
|
s->append_daughter(w);
|
|
s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
|
|
s->set("prob",0.0); // maybe should be epsilon
|
|
}
|
|
else
|
|
{
|
|
extract_forced_parse(start,start+1,s->append_daughter(),w);
|
|
EST_Item *st = s->append_daughter();
|
|
st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
|
|
st->set("prob",0.0); // maybe should be epsilon
|
|
EST_Item *nw = w->next();
|
|
extract_forced_parse(start+1,end,st,nw);
|
|
}
|
|
}
|
|
|
|
void EST_SCFG_Chart::extract_edge(int start, int end, int p,
|
|
EST_SCFG_Chart_Edge *e,
|
|
EST_Item *s,
|
|
EST_Item **word)
|
|
{
|
|
// Build the node for this edge, and all of its daughters
|
|
|
|
if (e->prob() == 0)
|
|
{
|
|
return; // failed
|
|
}
|
|
else if (start+1 == end)
|
|
{
|
|
// unary rule, preterminal
|
|
s->append_daughter((*word));
|
|
s->set_name(grammar->nonterminal(p));
|
|
s->set("prob",(float)e->prob());
|
|
*word = (*word)->next(); // increment along "word" stream
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
//name prob start end daughters
|
|
EST_SCFG_Chart_Edge *d1, *d2;
|
|
|
|
d1 = edges[start][e->pos()][e->d1()];
|
|
d2 = edges[e->pos()][end][e->d2()];
|
|
|
|
// Inserts the new nodes in the tree (and creates new si nodes)
|
|
s->append_daughter();
|
|
s->append_daughter();
|
|
|
|
extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
|
|
extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
|
|
|
|
s->set_name(grammar->nonterminal(p));
|
|
s->set("prob",(float)e->prob());
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
void EST_SCFG_chart_load_relation(EST_Relation &s,LISP sent)
|
|
{
|
|
// Set up well formed substring table form lisp list
|
|
// Setup a relation and call the standard method of set up
|
|
LISP w,f;
|
|
|
|
for (w=sent; w != NIL; w=cdr(w))
|
|
{
|
|
EST_Item *word = s.append();
|
|
|
|
if (consp(car(w)))
|
|
{ // a word with other feature info
|
|
word->set_name(get_c_string(car(car(w))));
|
|
if (consp(car(cdr(car(w)))))
|
|
for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
|
|
{
|
|
if (FLONUMP(car(cdr(car(f)))))
|
|
word->set(get_c_string(car(car(f))),
|
|
get_c_float(car(cdr(car(f)))));
|
|
else
|
|
word->set(get_c_string(car(car(f))),
|
|
get_c_string(car(cdr(car(f)))));
|
|
}
|
|
else // we assume its a POS value, cause they didn't say
|
|
word->set("name",get_c_string(car(cdr(car(w)))));
|
|
}
|
|
else // for easy we set the pos field to the be the name
|
|
word->set("name",get_c_string(car(w)));
|
|
}
|
|
}
|
|
|
|
void scfg_parse(EST_Relation *Word, const EST_String &name,
|
|
EST_Relation *Syntax, EST_SCFG &grammar)
|
|
{
|
|
// Parse feature name in Word to build Syntax relation
|
|
// The relations names above are *not* the names of the relations
|
|
// just named to reflect there conceptual usage
|
|
EST_SCFG_Chart chart;
|
|
|
|
chart.set_grammar_rules(grammar);
|
|
chart.setup_wfst(Word,name);
|
|
chart.parse();
|
|
chart.extract_parse(Syntax,Word,TRUE);
|
|
|
|
return;
|
|
}
|
|
|
|
LISP scfg_parse(LISP string, LISP grammar)
|
|
{
|
|
// Parse and return full parse
|
|
EST_SCFG_Chart chart;
|
|
EST_Relation words;
|
|
LISP parse;
|
|
|
|
chart.set_grammar_rules(grammar);
|
|
|
|
EST_SCFG_chart_load_relation(words,string);
|
|
chart.setup_wfst(&words,"name");
|
|
chart.parse();
|
|
parse = chart.find_parse();
|
|
|
|
return parse;
|
|
}
|
|
|
|
LISP scfg_parse(LISP string, EST_SCFG &grammar)
|
|
{
|
|
// Parse and return full parse
|
|
EST_SCFG_Chart chart;
|
|
EST_Relation words;
|
|
LISP parse;
|
|
|
|
chart.set_grammar_rules(grammar);
|
|
|
|
EST_SCFG_chart_load_relation(words,string);
|
|
chart.setup_wfst(&words,"name");
|
|
chart.parse();
|
|
parse = chart.find_parse();
|
|
|
|
return parse;
|
|
}
|
|
|
|
LISP scfg_bracketing_only(LISP parse)
|
|
{
|
|
if (consp(siod_nth(4,parse)))
|
|
{
|
|
LISP d,ds;
|
|
|
|
for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
|
|
ds = cons(scfg_bracketing_only(car(d)),ds);
|
|
return reverse(ds);
|
|
}
|
|
else
|
|
return siod_nth(4,parse);
|
|
|
|
}
|
|
|
|
void EST_SCFG_traintest::test_crossbrackets()
|
|
{
|
|
// Compare bracketing of best parse to bracketing on original
|
|
// For each sentence parse it (unbracketed) and then
|
|
// find the percentage of valid brackets in parsed version that
|
|
// are valid in the original one.
|
|
int c;
|
|
LISP parse;
|
|
EST_SuffStats cb;
|
|
int failed = 0;
|
|
int fully_contained=0;
|
|
|
|
for (c=0; c < corpus.length(); c++)
|
|
{
|
|
LISP flat = siod_flatten(corpus.a_no_check(c).string());
|
|
|
|
parse = scfg_parse(flat,*this);
|
|
if (parse == NIL)
|
|
{
|
|
failed++;
|
|
continue;
|
|
}
|
|
|
|
EST_bracketed_string parsed(scfg_bracketing_only(parse));
|
|
EST_SuffStats vs;
|
|
|
|
count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
|
|
|
|
if (vs.mean() == 1)
|
|
fully_contained++;
|
|
cb += vs.mean();
|
|
}
|
|
|
|
cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
|
|
" failed " << (float)(100.0*fully_contained)/corpus.length() <<
|
|
"% fully consistent from " << corpus.length()
|
|
<< " sentences)" << endl;
|
|
|
|
}
|
|
|
|
void count_bracket_crossing(const EST_bracketed_string &ref,
|
|
const EST_bracketed_string &test,
|
|
EST_SuffStats &vs)
|
|
{
|
|
int i,j;
|
|
|
|
if (ref.length() != test.length())
|
|
{
|
|
EST_error("bracket_crossing: sentences of different lengths");
|
|
}
|
|
|
|
for (i=0; i < ref.length(); i++)
|
|
for (j=i+1; j <= ref.length(); j++)
|
|
if (test.valid(i,j) == 1)
|
|
{
|
|
if (ref.valid(i,j) == 0)
|
|
vs += 0;
|
|
else
|
|
vs += 1;
|
|
}
|
|
}
|