171 lines
5.9 KiB
Scheme
171 lines
5.9 KiB
Scheme
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; ;;
|
|
;;; Centre for Speech Technology Research ;;
|
|
;;; University of Edinburgh, UK ;;
|
|
;;; Copyright (c) 1997 ;;
|
|
;;; All Rights Reserved. ;;
|
|
;;; ;;
|
|
;;; Permission is hereby granted, free of charge, to use and distribute ;;
|
|
;;; this software and its documentation without restriction, including ;;
|
|
;;; without limitation the rights to use, copy, modify, merge, publish, ;;
|
|
;;; distribute, sublicense, and/or sell copies of this work, and to ;;
|
|
;;; permit persons to whom this work is furnished to do so, subject to ;;
|
|
;;; the following conditions: ;;
|
|
;;; 1. The code must retain the above copyright notice, this list of ;;
|
|
;;; conditions and the following disclaimer. ;;
|
|
;;; 2. Any modifications must be clearly marked as such. ;;
|
|
;;; 3. Original authors' names are not deleted. ;;
|
|
;;; 4. The authors' names are not used to endorse or promote products ;;
|
|
;;; derived from this software without specific prior written ;;
|
|
;;; permission. ;;
|
|
;;; ;;
|
|
;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;;
|
|
;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;
|
|
;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;
|
|
;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;;
|
|
;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;
|
|
;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;
|
|
;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;
|
|
;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;
|
|
;;; THIS SOFTWARE. ;;
|
|
;;; ;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Author: Alan W Black
|
|
;;; Date: December 1997
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; THIS IS EXPERIMENTAL AND DOES *NOT* WORK
|
|
;;;
|
|
;;;
|
|
;;; An English morpho-syntax finite-state grammar
|
|
;;; This is used for morphological decomposition of unknown words
|
|
;;; specifically (only) words that are not found in the lexicon.
|
|
;;; This idea is that when an unknown word is found an attempt is made
|
|
;;; to see if it contains any well known morphological inflections or
|
|
;;; derivations, if so a better use of LTS can be made on the root, of
|
|
;;; none are found this
|
|
;;;
|
|
;;;
|
|
;;; Based on "Analysis of Unknown Words through Morphological
|
|
;;; Decomposition", Black, van de Plassche, Willians, European ACL 91.
|
|
;;; with the anyword matcher from a question by Lauri Karttunen after
|
|
;;; the talk.
|
|
;;;
|
|
;;; The suffixes and finite-state morph-syntax grammar is based
|
|
;;; (very roughly) on the rules in "Computational Morphology"
|
|
;;; Ritchie et al. MIT Press 1992.
|
|
;;;
|
|
;;; Can be compiled with
|
|
;;; wfst_build -type rg -o engmorphsyn.wfst -detmin engmorphsyn.scm
|
|
;;;
|
|
;;; The result can be combined with the morphographemic rules
|
|
;;; with
|
|
;;; wfst_build -type compose engmorph.wfst engmorphsyn.wfst -detmin -o engstemmer.wfst
|
|
;;;
|
|
;;; echo "# b o x e/+ s #" | wfst_run -wfst engstemmer.wfst -recog
|
|
;;; state 0 #/# -> 1
|
|
;;; state 1 b/b -> 3
|
|
;;; state 3 o/o -> 17
|
|
;;; state 17 x/x -> 14
|
|
;;; state 14 e/+ -> 36
|
|
;;; state 36 s/s -> 34
|
|
;;; state 34 #/# -> 16
|
|
;;; OK.
|
|
;;; echo "# b o x e s #" | wfst_run -wfst engstemmer.wfst -recog
|
|
;;; state 0 #/# -> 1
|
|
;;; state 1 b/b -> 3
|
|
;;; state 3 o/o -> 17
|
|
;;; state 17 x/x -> 14
|
|
;;; state 14 e/e -> 22
|
|
;;; state 22 s/s -> -1
|
|
|
|
(RegularGrammar
|
|
engsuffixmorphosyntax
|
|
;; Sets
|
|
(
|
|
(V a e i o u y)
|
|
(C b c d f g h j k l m n p q r s t v w x y z)
|
|
)
|
|
;; Rules
|
|
|
|
(
|
|
;; A word *must* have a suffix to be recognized
|
|
(Word -> # Syls Suffix )
|
|
(Word -> # Syls End )
|
|
|
|
;; This matches any string of characters that contains at least one vowel
|
|
(Syls -> Syl Syls )
|
|
(Syls -> Syl )
|
|
(Syl -> Cs V Cs )
|
|
(Cs -> C Cs )
|
|
(Cs -> )
|
|
|
|
(Suffix -> VerbSuffix )
|
|
(Suffix -> NounSuffix )
|
|
(Suffix -> AdjSuffix )
|
|
(VerbSuffix -> VerbFinal End )
|
|
(VerbSuffix -> VerbtoNoun NounSuffix )
|
|
(VerbSuffix -> VerbtoNoun End )
|
|
(VerbSuffix -> VerbtoAdj AdjSuffix )
|
|
(VerbSuffix -> VerbtoAdj End )
|
|
(NounSuffix -> NounFinal End )
|
|
(NounSuffix -> NountoNoun NounSuffix )
|
|
(NounSuffix -> NountoNoun End )
|
|
(NounSuffix -> NountoAdj AdjSuffix )
|
|
(NounSuffix -> NountoAdj End )
|
|
(NounSuffix -> NountoVerb VerbSuffix )
|
|
(NounSuffix -> NountoVerb End )
|
|
(AdjSuffix -> AdjFinal End )
|
|
(AdjSuffix -> AdjtoAdj AdjSuffix)
|
|
(AdjSuffix -> AdjtoAdj End)
|
|
(AdjSuffix -> AdjtoAdv End) ;; isn't any Adv to anything
|
|
|
|
(End -> # ) ;; word boundary symbol *always* present
|
|
|
|
(VerbFinal -> + e d)
|
|
(VerbFinal -> + i n g)
|
|
(VerbFinal -> + s)
|
|
|
|
(VerbtoNoun -> + e r)
|
|
(VerbtoNoun -> + e s s)
|
|
(VerbtoNoun -> + a t i o n)
|
|
(VerbtoNoun -> + i n g)
|
|
(VerbtoNoun -> + m e n t)
|
|
|
|
(VerbtoAdj -> + a b l e)
|
|
|
|
(NounFinal -> + s)
|
|
|
|
(NountoNoun -> + i s m)
|
|
(NountoNoun -> + i s t)
|
|
(NountoNoun -> + s h i p)
|
|
|
|
(NountoAdj -> + l i k e)
|
|
(NountoAdj -> + l e s s)
|
|
(NountoAdj -> + i s h)
|
|
(NountoAdj -> + o u s)
|
|
|
|
(NountoVerb -> + i f y)
|
|
(NountoVerb -> + i s e)
|
|
(NountoVerb -> + i z e)
|
|
|
|
(AdjFinal -> + e r)
|
|
(AdjFinal -> + e s t)
|
|
|
|
(AdjtoAdj -> + i s h)
|
|
(AdjtoAdv -> + l y)
|
|
(AdjtoNoun -> + n e s s)
|
|
(AdjtoVerb -> + i s e)
|
|
(AdjtoVerb -> + i z e)
|
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|