speech-tools/include/EST_Ngrammar.h

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996                              */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Simon King & Alan W Black                   */
/*                 Date   :  February 1997                               */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/*  A general class for ngrams (bi-gram, tri-gram etc)                   */
/*                                                                       */
/*=======================================================================*/
#ifndef __EST_NGRAMMAR_H__
#define __EST_NGRAMMAR_H__

#include <cstdarg>
#include <cstdlib>

using namespace std;

#include "EST_String.h"
#include "EST_Val.h"
#include "EST_rw_status.h"
#include "EST_types.h"
#include "EST_FMatrix.h"
#include "EST_TList.h"
#include "EST_StringTrie.h"
#include "EST_simplestats.h"
#include "EST_PST.h"
#include "EST_string_aux.h"
#include "EST_math.h"
// HTK style
#define SENTENCE_START_MARKER "!ENTER"
#define SENTENCE_END_MARKER "!EXIT"
#define OOV_MARKER "!OOV"

#define EST_NGRAMBIN_MAGIC 1315402337

// for compressed save/load
#define GZIP_FILENAME_EXTENSION "gz"
#define COMPRESS_FILENAME_EXTENSION "Z"

// ultimate floor for frequencies
#define TINY_FREQ 1.0e-10
// ngram state - represents the N-1 word history and contains
// the pdf of the next word
class EST_NgrammarState {
private:
protected:
EST_DiscreteProbDistribution p_pdf;
int p_id; // a 'name'
public:
EST_NgrammarState() :
p_pdf()
{
init();
};
EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};
EST_NgrammarState(int id,const EST_DiscreteProbDistribution &pdf)
{clear();init(id,pdf);};
EST_NgrammarState(const EST_NgrammarState &s);
EST_NgrammarState(const EST_NgrammarState *const s);
~EST_NgrammarState();
EST_IVector path; // how we got here
// initialise
void clear();
void init();
void init(int id, EST_Discrete *d);
void init(int id, const EST_DiscreteProbDistribution &pdf);
// build
void cumulate(const int index, const double count=1)
{p_pdf.cumulate(index,count);};
void cumulate(const EST_String &word, const double count=1)
{p_pdf.cumulate(word,count);};
// access
int id() const {return p_id; };
const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
EST_DiscreteProbDistribution &pdf() {return p_pdf; };
double probability(const EST_String &w) const
{return p_pdf.probability(w);}
double probability(int w) const {return p_pdf.probability(w);}
double frequency(const EST_String &w) const
{return p_pdf.frequency(w);}
double frequency(int w) const {return p_pdf.frequency(w);}
const EST_String &most_probable(double *prob = NULL) const
{return p_pdf.most_probable(prob);}
friend ostream& operator<<(ostream& s, const EST_NgrammarState &a);
};
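// A minimal usage sketch (illustration only, not part of the API):
// counts are cumulated into a state's pdf during training and the pdf
// is queried later. "d" is a hypothetical EST_Discrete* built
// elsewhere from a word list.
//
//     EST_NgrammarState s;
//     s.init(0, d);                    // state 0 over vocabulary d
//     s.cumulate("cat");               // count an observation
//     s.cumulate("sat", 2.0);          // counts may be weighted
//     double p = s.probability("cat");
//     double f = s.frequency("sat");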
class EST_BackoffNgrammarState {
private:
protected:
int p_level; // = 0 for root node
double backoff_weight;
EST_DiscreteProbDistribution p_pdf;
EST_StringTrie children;
EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
const EST_StrVector &words);
EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
const EST_IVector &words);
public:
EST_BackoffNgrammarState()
{ init(); };
EST_BackoffNgrammarState(const EST_Discrete *d,int level)
{clear();init(d,level);};
EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf,int level)
{clear();init(pdf,level);};
EST_BackoffNgrammarState(const EST_BackoffNgrammarState &s);
EST_BackoffNgrammarState(const EST_BackoffNgrammarState *const s);
~EST_BackoffNgrammarState();
// initialise
void clear();
void init();
void init(const EST_Discrete *d, int level);
void init(const EST_DiscreteProbDistribution &pdf, int level);
// build
bool accumulate(const EST_StrVector &words,
const double count=1);
bool accumulate(const EST_IVector &words,
const double count=1);
// access
const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
EST_DiscreteProbDistribution &pdf() {return p_pdf; };
double probability(const EST_String &w) const
{return p_pdf.probability(w);}
double frequency(const EST_String &w) const
{return p_pdf.frequency(w);}
const EST_String &most_probable(double *prob = NULL) const
{return p_pdf.most_probable(prob);}
const int level() const {return p_level;}
EST_BackoffNgrammarState* get_child(const EST_String &word) const
{
return (EST_BackoffNgrammarState*)children.lookup(word);
}
EST_BackoffNgrammarState* get_child(const int word) const
{
return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
}
void remove_child(EST_BackoffNgrammarState *child,
const EST_String &name);
// recursive delete of contents and children
void zap();
const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const;
bool ngram_exists(const EST_StrVector &words,
const double threshold) const;
const double get_backoff_weight() const {return backoff_weight; }
const double get_backoff_weight(const EST_StrVector &words) const;
bool set_backoff_weight(const EST_StrVector &words, const double w);
void frequency_of_frequencies(EST_DVector &ff);
void print_freqs(ostream &os,const int order,EST_String followers="");
friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a);
};
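// A minimal sketch (illustration only): backoff states form a tree
// normally managed by EST_Ngrammar itself; a whole ngram's count is
// pushed in at the root. "vocab" is a hypothetical EST_Discrete*.
//
//     EST_BackoffNgrammarState root(vocab, 0);   // level 0 = root
//     EST_StrVector ngram(3);
//     ngram[0] = "the"; ngram[1] = "cat"; ngram[2] = "sat";
//     root.accumulate(ngram);                    // count this trigram
//     bool seen = root.ngram_exists(ngram, 0.0); // frequency above 0?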
class EST_Ngrammar {
public:
// 3 representations : sparse, dense and backed off. User specifies which.
enum representation_t {sparse, dense, backoff};
// now only keep frequencies (or log frequencies)
// probabilities (or log probabilities) can be done
// on the fly quickly enough
enum entry_t {frequencies, log_frequencies};
protected:
// each instance of an EST_Ngrammar is a grammar of fixed order
// e.g. a bigram (order = 2)
int p_order;
int p_num_samples;
double p_number_of_sentences; // which were used to build this grammar
EST_String p_sentence_start_marker;
EST_String p_sentence_end_marker;
// only one representation in use at a time
representation_t p_representation;
entry_t p_entry_type;
// sparse representation is a tree structure
// holding only those N-grams which were seen
EST_PredictionSuffixTree sparse_representation;
bool init_sparse_representation();
// dense representation is just an array of all states
bool init_dense_representation();
// backoff representation is also a tree structure,
// but the root state pdf is over the most recent word in the
// ngram, and going down the tree is going back in time ...
// here is the root node:
EST_BackoffNgrammarState *backoff_representation;
double backoff_threshold;
// need a non-zero unigram floor to enable backing off
double backoff_unigram_floor_freq;
// instead of simple discounting, we have a (possibly) different
// discount per order and per frequency
// e.g. backoff_discount[2](4) contains the discount to be
// applied to a trigram frequency of 4
// backoff_discount[0] is unused (we don't discount unigrams)
EST_DVector *backoff_discount;
const double get_backoff_discount(const int order, const double freq) const;
bool init_backoff_representation();
void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches
void backoff_restore_unigram_states();
int p_num_states; // == p_vocab_size ^ (p_order-1) if fully dense
EST_NgrammarState *p_states; // state id is index into this array
int find_dense_state_index(const EST_IVector &words, int index=0) const;
// and the reverse
const EST_StrVector &make_ngram_from_index(const int i) const;
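// For example (an illustration, assuming row-major packing of the
// N-1 word history): with vocab size V=4 and order 3 there are
// 4^2 = 16 dense states, and a history of word indices (2,3) maps
// to state index 2*4 + 3 = 11.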
// vocabulary
EST_Discrete *vocab;
EST_Discrete *pred_vocab; // may be different from state vocab
bool init_vocab(const EST_StrList &wordlist);
bool init_vocab(const EST_StrList &word_list,
const EST_StrList &pred_list);
// make sure vocab matches a given wordlist
bool check_vocab(const EST_StrList &wordlist);
EST_DiscreteProbDistribution vocab_pdf; // overall pdf
const EST_String &lastword(const EST_StrVector &words) const
{ return words(p_order-1); }
const int lastword(const EST_IVector &words) const
{ return words(p_order-1); }
// are we allowing out-of-vocabulary words, or is the vocabulary closed?
bool allow_oov;
bool sparse_to_dense();
bool dense_to_sparse();
// these aren't sorted out yet ...
void take_logs();
void take_exps();
void freqs_to_probs(); // just calls normalise
bool build_sparse(const EST_String &filename,
const EST_String &prev,
const EST_String &prev_prev,
const EST_String &last);
// for dense and backoff
bool build_ngram(const EST_String &filename,
const EST_String &prev,
const EST_String &prev_prev,
const EST_String &last,
const EST_String &input_format);
// go through all matching ngrams (an empty element, words(i) == "", matches anything)
void iterate(EST_StrVector &words,
void (*function)(EST_Ngrammar *n,
EST_StrVector &words,
void *params),
void *params);
// same, but with a constant Ngrammar
void const_iterate(EST_StrVector &words,
void (*function)(const EST_Ngrammar *const n,
EST_StrVector &words,
void *params),
void *params) const;
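// A sketch (illustration only) of the callback shape these iterators
// expect; "total" is a hypothetical accumulator passed through the
// untyped params pointer:
//
//     static void sum_freqs(EST_Ngrammar *n, EST_StrVector &words,
//                           void *params)
//     {
//         double *total = (double*)params;
//         *total += n->frequency(words);
//     }
//
// and then:  iterate(words, sum_freqs, &total);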
bool p_init(int o, representation_t r);
// a new filename is returned if we had to copy stdin to a
// temporary file - we must delete it later!
bool oov_preprocess(const EST_String &filename,
EST_String &new_filename,
const EST_String &what);
const EST_NgrammarState &find_state_const(const EST_StrVector &words)const;
EST_NgrammarState &find_state(const EST_StrVector &words);
const EST_NgrammarState &find_state_const(const EST_IVector &words) const;
EST_NgrammarState &find_state(const EST_IVector &words);
// special versions for backoff grammars
const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;
const double backoff_reverse_probability_sub(const EST_StrVector &words,
const EST_BackoffNgrammarState *root) const;
const double backoff_probability(const EST_StrVector &words,
const bool trace=false) const;
const double backoff_reverse_probability(const EST_StrVector &words) const;
const EST_String & backoff_most_probable(const EST_StrVector &words,
double *prob = NULL) const;
// backoff representation isn't a nice array of states
// so use this to visit every node in the tree
// and apply the function to that node
void backoff_traverse(EST_BackoffNgrammarState *start_state,
void (*function)(EST_BackoffNgrammarState *s,
void *params),
void *params);
// visit every node at a given level
void backoff_traverse(EST_BackoffNgrammarState *start_state,
void (*function)(EST_BackoffNgrammarState *s,
void *params),
void *params, const int level);
public:
EST_Ngrammar() {default_values();}
EST_Ngrammar(int o, representation_t r,
const EST_StrList &wordlist)
{
default_values(); init(o,r,wordlist);
}
// When state trans vocab differs from prediction vocab
EST_Ngrammar(int o, representation_t r,
const EST_StrList &wordlist,
const EST_StrList &predlist)
{
default_values(); init(o,r,wordlist,predlist);
}
EST_Ngrammar(int o, representation_t r, EST_Discrete &v)
{
default_values(); init(o,r,v);
}
~EST_Ngrammar();
void default_values();
void clear();
bool init(int o, representation_t r,
const EST_StrList &wordlist);
bool init(int o, representation_t r,
const EST_StrList &wordlist,
const EST_StrList &predlist);
bool init(int o, representation_t r, EST_Discrete &v);
bool init(int o, representation_t r,
EST_Discrete &v,EST_Discrete &pv);
// access
int num_states(void) const { return p_num_states;}
double samples(void) const { return p_num_samples;}
int order() const { return p_order; }
int get_vocab_length() const { return vocab?vocab->length():0; }
EST_String get_vocab_word(int i) const;
int get_vocab_word(const EST_String &s) const;
int get_pred_vocab_length() const { return pred_vocab->length(); }
EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }
int get_pred_vocab_word(const EST_String &s) const
{ return pred_vocab->name(s); }
int closed_vocab() const {return !allow_oov; }
entry_t entry_type() const {return p_entry_type;}
representation_t representation() const
{ return p_representation;}
// build
bool build(const EST_StrList &filenames,
const EST_String &prev = SENTENCE_START_MARKER,
const EST_String &prev_prev = SENTENCE_END_MARKER,
const EST_String &last = SENTENCE_END_MARKER,
const EST_String &input_format = "",
const EST_String &oov_mode = "",
const int mincount=1,
const int maxcount=10);
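// For example (a sketch; "train.txt" is a hypothetical one-sentence-
// per-line text file and "ng" an already initialised EST_Ngrammar):
//
//     EST_StrList files;
//     files.append("train.txt");
//     if (!ng.build(files))
//         cerr << "ngram build failed" << endl;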
// Accumulate ngrams
void accumulate(const EST_StrVector &words,
const double count=1);
//const int index=0);
void accumulate(const EST_IVector &words,
const double count=1);
//const int index=0);
// hack - fix enter/exit probs s.t. P(...,!ENTER) = P(!EXIT,...) = 0 for any context
void make_htk_compatible();
// I/O functions
EST_read_status load(const EST_String &filename);
EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);
EST_write_status save(const EST_String &filename,
const EST_String type="cstr_ascii",
const bool trace=false,
double floor=0.0);
int wordlist_index(const EST_String &word, const bool report=true) const;
const EST_String &wordlist_index(int i) const;
int predlist_index(const EST_String &word) const;
const EST_String &predlist_index(int i) const;
// set
bool set_entry_type(entry_t new_type);
bool set_representation(representation_t new_representation);
// probability distributions
// -------------------------
// flag 'force' forces computation of probs on-the-fly if necessary
double probability(const EST_StrVector &words, bool force=false,
const bool trace=false) const;
double frequency(const EST_StrVector &words, bool force=false,
const bool trace=false) const;
const EST_String &predict(const EST_StrVector &words,
double *prob,int *state) const;
const EST_String &predict(const EST_StrVector &words) const
{double p; int state; return predict(words,&p,&state); }
const EST_String &predict(const EST_StrVector &words,double *prob) const
{int state; return predict(words,prob,&state); }
const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;
const EST_String &predict(const EST_IVector &words) const
{double p; int state; return predict(words,&p,&state); }
const EST_String &predict(const EST_IVector &words,double *prob) const
{int state; return predict(words,prob,&state); }
int find_state_id(const EST_StrVector &words) const;
int find_state_id(const EST_IVector &words) const;
int find_next_state_id(int state, int word) const;
// fast versions for common N
//const double probability(const EST_String w1);
//const double probability(const EST_String w1,const EST_String w2);
//const double probability(const EST_String w1,const EST_String w2,
//                         const EST_String w3);
// reverse - probability of words[0..order-2] given words[order-1]
double reverse_probability(const EST_StrVector &words,
bool force=false) const;
double reverse_probability(const EST_IVector &words,
bool force=false) const;
// predict, where words has 'order' elements and the last one is "" or NULL
const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;
const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;
const EST_DiscreteProbDistribution &prob_dist(int state) const;
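// For example (a sketch, assuming a trained trigram "ng" whose
// vocabulary contains the context words):
//
//     EST_StrVector window(3);
//     window[0] = "the"; window[1] = "cat"; window[2] = "";  // last empty
//     const EST_DiscreteProbDistribution &d = ng.prob_dist(window);
//     double p;
//     const EST_String &next = ng.predict(window, &p);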
// bool stats(const EST_String filename,
// double &raw_entropy, double &count,
// double &entropy, double &perplexity,
// const EST_String &prev = SENTENCE_START_MARKER,
// const EST_String &prev_prev = SENTENCE_END_MARKER,
// const EST_String &last = SENTENCE_END_MARKER,
// const EST_String &input_format = "") const;
void fill_window_start(EST_IVector &window,
const EST_String &prev,
const EST_String &prev_prev) const;
void fill_window_start(EST_StrVector &window,
const EST_String &prev,
const EST_String &prev_prev) const;
// why anybody would want to do this ....
//EST_Ngrammar &operator =(const EST_Ngrammar &a);
bool ngram_exists(const EST_StrVector &words) const;
bool ngram_exists(const EST_StrVector &words, const double threshold) const;
const double get_backoff_weight(const EST_StrVector &words) const;
bool set_backoff_weight(const EST_StrVector &words, const double w);
void print_freqs(ostream &os,double floor=0.0);
// i/o functions
// -------------
friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
friend EST_read_status load_ngram_htk_ascii(const EST_String filename,
EST_Ngrammar &n);
friend EST_read_status load_ngram_htk_binary(const EST_String filename,
EST_Ngrammar &n);
friend EST_read_status load_ngram_arpa(const EST_String filename,
EST_Ngrammar &n,
const EST_StrList &vocab);
friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,
EST_Ngrammar &n);
friend EST_read_status load_ngram_cstr_bin(const EST_String filename,
EST_Ngrammar &n);
friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word,
ostream *ost,
EST_Ngrammar &n,
double floor);
friend EST_write_status save_ngram_htk_ascii(const EST_String filename,
EST_Ngrammar &n,
double floor);
//friend EST_write_status save_ngram_htk_binary(const EST_String filename,
// EST_Ngrammar &n);
friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,
EST_Ngrammar &n,
const bool trace,
double floor);
friend EST_write_status save_ngram_cstr_bin(const EST_String filename,
EST_Ngrammar &n,
const bool trace,
double floor);
friend EST_write_status save_ngram_arpa(const EST_String filename,
EST_Ngrammar &n);
friend EST_write_status save_ngram_arpa_sub(ostream *ost,
EST_Ngrammar &n,
const EST_StrVector &words);
friend EST_write_status save_ngram_wfst(const EST_String filename,
EST_Ngrammar &n);
// Auxiliary functions
// smoothing
friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);
friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);
friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);
friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
const double default_discount);
friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
EST_Ngrammar &ngram);
friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
EST_Ngrammar &ngram, int smooth_thresh);
// frequencies below mincount get backed off;
// frequencies above maxcount are not smoothed (discounted)
bool compute_backoff_weights(const int mincount=1,
const int maxcount=10);
bool merge(EST_Ngrammar &n,float weight);
friend class EST_BackoffNgrammar;
};
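// End-to-end usage, as a sketch only (the training file name, word
// list contents and context words below are all hypothetical):
//
//     EST_StrList wordlist;                 // filled by the caller
//     EST_Ngrammar ng(3, EST_Ngrammar::dense, wordlist);
//
//     EST_StrList files;
//     files.append("train.txt");
//     ng.build(files);                      // count ngrams from the data
//
//     EST_StrVector ngram(3);
//     ngram[0] = "the"; ngram[1] = "cat"; ngram[2] = "sat";
//     double p = ng.probability(ngram);     // P(sat | the, cat)
//     ng.save("trigram.ngram", "cstr_ascii");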
void Ngram_freqsmooth(EST_Ngrammar &ngram,
int smooth_thresh1,
int smooth_thresh2);
// utils
void slide(EST_IVector &i, const int l);
void slide(EST_StrVector &i, const int l);
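// A typical sliding-window scoring loop, sketched under the assumption
// that slide(w,-1) shifts the window left by one place (dropping the
// oldest word); "sentence" is a hypothetical EST_StrList of words:
//
//     EST_StrVector window(ng.order());
//     ng.fill_window_start(window, SENTENCE_START_MARKER,
//                          SENTENCE_END_MARKER);
//     double logp = 0.0;
//     for (EST_Litem *w = sentence.head(); w != 0; w = w->next())
//     {
//         window[ng.order()-1] = sentence(w);
//         logp += log(ng.probability(window));
//         slide(window, -1);
//     }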
bool test_stats(EST_Ngrammar &ngram,
const EST_String &filename,
double &raw_entropy,
double &count,
double &entropy,
double &perplexity,
const EST_String &input_format,
const EST_String &prev = SENTENCE_START_MARKER,
const EST_String &prev_prev = SENTENCE_END_MARKER,
const EST_String &last = SENTENCE_END_MARKER);
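// For example (a sketch; "test.txt" is a hypothetical test file in
// the same format as the training data):
//
//     double H_raw, count, H, pp;
//     if (test_stats(ng, "test.txt", H_raw, count, H, pp, ""))
//         cout << "perplexity " << pp << endl;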
VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar)
#endif // __EST_NGRAMMAR_H__