festival/lib/tokenpos.scm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;                                                                       ;;
;;;                Centre for Speech Technology Research                  ;;
;;;                     University of Edinburgh, UK                       ;;
;;;                       Copyright (c) 1996,1997                         ;;
;;;                        All Rights Reserved.                           ;;
;;;                                                                       ;;
;;;  Permission is hereby granted, free of charge, to use and distribute  ;;
;;;  this software and its documentation without restriction, including   ;;
;;;  without limitation the rights to use, copy, modify, merge, publish,  ;;
;;;  distribute, sublicense, and/or sell copies of this work, and to      ;;
;;;  permit persons to whom this work is furnished to do so, subject to   ;;
;;;  the following conditions:                                            ;;
;;;   1. The code must retain the above copyright notice, this list of    ;;
;;;      conditions and the following disclaimer.                         ;;
;;;   2. Any modifications must be clearly marked as such.                ;;
;;;   3. Original authors' names are not deleted.                         ;;
;;;   4. The authors' names are not used to endorse or promote products   ;;
;;;      derived from this software without specific prior written        ;;
;;;      permission.                                                      ;;
;;;                                                                       ;;
;;;  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        ;;
;;;  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ;;
;;;  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ;;
;;;  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     ;;
;;;  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ;;
;;;  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ;;
;;;  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ;;
;;;  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ;;
;;;  THIS SOFTWARE.                                                       ;;
;;;                                                                       ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;  Functions used in identifying token types.
;;;

(defvar token_most_common
'(
sym numeric month to day in the of on and writes a years from
for jst at million by is was gmt page he that than more since as when
with but after about or his i has it date no died number bst who miles
university some people an only w year have ago were are pages up days
months hours minutes through out had which least hi last now ft this
all one its there between cents until over will before past they
nearly times tim message so lbs just if age we during she billion then
other be time new her first states not you members under would many
says degrees two next fax week while bush been around including back
campaign american within publisher flight points even early later
world countries every edt can president most could their what them
former began women killed another also received long americans pounds
do dear said km made into did dead war tel still old x took total men
like f am less c well late down weeks end chapter among place house
away him election death almost students state soviet where version
summer man s nation because washington top though m id est these spent
seats gnu estimated those lost ian high each copies children acres
tons son per my found won off seconds power nations federal born
presidential much city begin p name different whose three home hello)

"token_most_common
A list of (English) words which were found to be most common in
an text database and are used as discriminators in token analysis.")

(define (token_pos_guess sc)
"(tok_pos sc)
Returns a general pos for sc's name.
  numeric   All digits
  number    float or comma'd numeric
  sym       Contains at least one non alphanumeric
  month     has month name (or abbrev)
  day       has day name (or abbrev)
  rootname  else downcased alphabetic.
Note this can be used to find token_pos but isn't used directly as
its not disciminatory enough."
  (let ((name (downcase (item.name sc))))
    (cond
     ((string-matches name "[0-9]+")
      'numeric)
     ((or (string-matches name "[0-9]+\\.[0-9]+")
	  (string-matches name
	     "[0-9][0-9]?[0-9]?,\\([0-9][0-9][0-9],\\)*[0-9][0-9][0-9]"))
      'number)
     ((string-matches name ".*[^A-Za-z0-9].*")
      'sym)
     ((member_string name '(jan january feb february mar march
   			    apr april may jun june
			    jul july aug august sep sept september
			    oct october nov november dec december))
      'month)
     ((member_string name '(sun sunday mon monday tue tues tuesday
			    wed wednesday thu thurs thursday
			    fri friday sat saturday))
      'day)
     ((member_string name token_most_common)
      name)
     (t
      '_other_))))

(define (token_no_starting_quote token)
  "(token_no_starting_quote TOKEN)
Check to see if a single quote (or backquote) appears as prepunctuation
in this token or any previous one in this utterance.  This is used to
disambiguate ending single quote as possessive or end quote."
  (cond
   ((null token)
    t)
   ((string-matches (item.feat token "prepunctuation") "[`']")
    nil)
   (t
    (token_no_starting_quote (item.relation.prev token "Token")))))

(define (token_zerostart sc)
"(zerostart sc)
Returns, 1 if first char of sc's name is 0, 0 otherwise."
  (if (string-matches (item.name sc) "^0.*")
      "1"
      "0"))

(define (tok_roman_to_numstring roman)
  "(tok_roman_to_numstring ROMAN)
Takes a string of roman numerals and converts it to a number and
then returns the printed string of that.  Only deals with numbers up to 50."
  (let ((val 0) (chars (symbolexplode roman)))
    (while chars
     (cond
      ((equal? (car chars) 'X)
       (set! val (+ 10 val)))
      ((equal? (car chars) 'V)
       (set! val (+ 5 val)))
      ((equal? (car chars) 'I)
       (cond
	((equal? (car (cdr chars)) 'V)
	 (set! val (+ 4 val))
	 (set! chars (cdr chars)))
	((equal? (car (cdr chars)) 'X)
	 (set! val (+ 9 val))
	 (set! chars (cdr chars)))
	(t
	 (set! val (+ 1 val))))))
     (set! chars (cdr chars)))
    (format nil "%d" val)))

(define (num_digits sc)
"(num_digits SC)
Returns number of digits (actually chars) is SC's name."
  (string-length (format nil "%s" (item.name sc))))

(define (month_range sc)
"(month_range SC)
1 if SC's name is > 0 and < 32, 0 otherwise."
  (let ((val (parse-number (item.name sc))))
    (if (and (> val 0) (< val 32))
	"1"
	"0")))

(define (remove_leading_zeros name)
  "(remove_leading_zeros name)
Remove leading zeros from given string."
  (let ((nname name))
    (while (string-matches nname "^0..*")
	   (set! nname (string-after nname "0")))
    nname))

(define (token_money_expand type)
"(token_money_expand type)
Convert shortened form of money identifier to words if of a known type."
  (cond
   ((string-equal type "HK")
    (list "Hong" "Kong"))
   ((string-equal type "C")
    (list "Canadian"))
   ((string-equal type "A")
    (list "Australian"))
   ((< (length type) 4)
    (mapcar
     (lambda (letter)
       (list (list 'name letter)
	     (list 'pos token.letter_pos)))
     (symbolexplode type)))
   (t
    (list type))))

(define (find_month_from_number token string-number)
  "(find_month_from_number token string-number)
Find the textual representation of the month from the given string number"
  (let ((nnum (parse-number string-number)))
    (cond
     ((equal? 1 nnum) (list "January"))
     ((equal? 2 nnum) (list "February"))
     ((equal? 3 nnum) (list "March"))
     ((equal? 4 nnum) (list "April"))
     ((equal? 5 nnum) (list "May"))
     ((equal? 6 nnum) (list "June"))
     ((equal? 7 nnum) (list "July"))
     ((equal? 8 nnum) (list "August"))
     ((equal? 9 nnum) (list "September"))
     ((equal? 10 nnum) (list "October"))
     ((equal? 11 nnum) (list "November"))
     ((equal? 12 nnum) (list "December"))
     (t
      (cons "month"
	    (builtin_english_token_to_words token string-number))))))

(define (tok_allcaps sc)
  "(tok_allcaps sc)
Returns 1 if sc's name is all capitals, 0 otherwise"
  (if (string-matches (item.name sc) "[A-Z]+")
      "1"
      "0"))

(define (tok_section_name sc)
  "(tok_section_name sc)
Returns 1 if sc's name is in list of things that are section/chapter
like."
  (if (member_string
       (downcase (item.name sc))
       '(chapter section part article phrase verse scene act book
		 volume chap sect art vol war fortran saturn
		 trek))
      "1"
      "0"))

(define (tok_string_as_letters name)
  "(tok_string_as_letters NAME)
Return list of letters marked as letter part of speech made
by exploding NAME."
  (mapcar
   (lambda (letter)
     (list (list 'name letter)
	   (list 'pos token.letter_pos)))
   (symbolexplode name)))

(define (tok_rex sc)
  "(tok_rex sc)
Returns 1 if King like title is within 3 tokens before or 2 after."
  (let ((kings '(king queen pope duke tsar emperor shah ceasar
		      duchess tsarina empress baron baroness
		      count countess)))
    (if (or (member_string
	     (downcase (item.feat sc "R:Token.pp.name"))
	     kings)
	    (member_string
	     (downcase (item.feat sc "R:Token.pp.p.name"))
	     kings)
	    (member_string
	     (downcase (item.feat sc "R:Token.n.name"))
	     kings))
	"1"
	"0")))

(define (tok_rex_names sc)
  "(tok_rex sc)
Returns 1 if this is a King-like name."
  (if (and
       (member_string
        (downcase (item.name sc))
        '(louis henry charles philip george edward pius william richard
                ptolemy john paul peter nicholas
                alexander frederick james alfonso ivan napolean leo
                gregory catherine alexandria pierre elizabeth mary))
       (or (string-equal "" (item.feat sc "punc"))
           (string-equal "0" (item.feat sc "punc"))))
      "1"
      "0"))

(provide 'tokenpos)