107 lines
4.5 KiB
C++
107 lines
4.5 KiB
C++
/************************************************************************/
|
|
/* */
|
|
/* Centre for Speech Technology Research */
|
|
/* University of Edinburgh, UK */
|
|
/* Copyright (c) 1996,1997 */
|
|
/* All Rights Reserved. */
|
|
/* */
|
|
/* Permission is hereby granted, free of charge, to use and distribute */
|
|
/* this software and its documentation without restriction, including */
|
|
/* without limitation the rights to use, copy, modify, merge, publish, */
|
|
/* distribute, sublicense, and/or sell copies of this work, and to */
|
|
/* permit persons to whom this work is furnished to do so, subject to */
|
|
/* the following conditions: */
|
|
/* 1. The code must retain the above copyright notice, this list of */
|
|
/* conditions and the following disclaimer. */
|
|
/* 2. Any modifications must be clearly marked as such. */
|
|
/* 3. Original authors' names are not deleted. */
|
|
/* 4. The authors' names are not used to endorse or promote products */
|
|
/* derived from this software without specific prior written */
|
|
/* permission. */
|
|
/* */
|
|
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
|
|
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
|
|
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
|
|
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
|
|
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
|
|
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
|
|
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
|
|
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
|
|
/* THIS SOFTWARE. */
|
|
/* */
|
|
/************************************************************************/
|
|
/* Author: Alan W Black */
|
|
/* Date: May 1997 */
|
|
/************************************************************************/
|
|
/* */
|
|
/* Example of reading a file using the tokenizer */
|
|
/* */
|
|
/************************************************************************/
|
|
|
|
#include <cstdlib>
|
|
#include "EST_Token.h"
|
|
|
|
// DATA is the string-literal form of the build-supplied DATAC macro; it is
// used below as the directory prefix of the default input file.
// NOTE(review): presumably DATAC is passed unquoted on the compiler command
// line (e.g. -DDATAC=some/dir) -- confirm against the build files.
#if defined(DATAC)
   // Stringizing needs two macro levels: the '#' operator does not
   // macro-expand its operand, so a single-level __STRINGIZE(DATAC) would
   // produce the literal "DATAC" instead of the value DATAC stands for.
   // The outer macro expands its argument first, the inner one quotes it.
#  define EST_STRINGIZE_LITERAL(X) #X
#  define __STRINGIZE(X) EST_STRINGIZE_LITERAL(X)
#  define DATA __STRINGIZE(DATAC)
#endif
|
|
|
|
int main(int argc,char **argv)
|
|
{
|
|
// Simple program to read all the tokens in the named file
|
|
// a print a summary of them
|
|
EST_TokenStream ts;
|
|
int tokens, alices, quotes;
|
|
EST_Token t;
|
|
EST_String fname;
|
|
|
|
if (argc > 2)
|
|
{
|
|
cerr << argv[0] << ": wrong number of arguments\n";
|
|
exit(-1);
|
|
}
|
|
else if (argc == 2)
|
|
fname = argv[1];
|
|
else
|
|
fname = DATA "/alice";
|
|
|
|
if (ts.open(fname) == -1)
|
|
{
|
|
cerr << argv[0] << ": can't open input file \"" << argv[1] <<
|
|
"\"\n";
|
|
exit(-1);
|
|
}
|
|
|
|
// Control of whitespace characters, single character symbols,
|
|
// pre and post punctuation may be set here.
|
|
|
|
// The defaults are standard whitespace, and nothing for the rest
|
|
// (this is like awk's basic tokenizer). For language analysis
|
|
// you'll probably want to modify the punctuation
|
|
// \173 is '{', it is inserted by number because of a doc++ problem.
|
|
|
|
ts.set_PrePunctuationSymbols("\173[(\"'");
|
|
ts.set_PunctuationSymbols(EST_Token_Default_PunctuationSymbols);
|
|
|
|
// Note you may set quotes so quoted tokens are read as single
|
|
// tokens (a la C)
|
|
|
|
for (tokens=quotes=alices=0; !ts.eof(); tokens++)
|
|
{
|
|
t = ts.get();
|
|
if (t == "Alice")
|
|
alices++;
|
|
if (t.prepunctuation().contains("\""))
|
|
quotes++;
|
|
}
|
|
|
|
printf("Input file contains:\n");
|
|
printf(" %5d tokens\n",tokens);
|
|
printf(" %5d tokens preceeded by double quotes\n",quotes);
|
|
printf(" %5d occurrences of Alice\n",alices);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|