305 lines
11 KiB
Scheme
305 lines
11 KiB
Scheme
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; ;;
|
|
;;; Centre for Speech Technology Research ;;
|
|
;;; University of Edinburgh, UK ;;
|
|
;;; Copyright (c) 1996,1997 ;;
|
|
;;; All Rights Reserved. ;;
|
|
;;; ;;
|
|
;;; Permission is hereby granted, free of charge, to use and distribute ;;
|
|
;;; this software and its documentation without restriction, including ;;
|
|
;;; without limitation the rights to use, copy, modify, merge, publish, ;;
|
|
;;; distribute, sublicense, and/or sell copies of this work, and to ;;
|
|
;;; permit persons to whom this work is furnished to do so, subject to ;;
|
|
;;; the following conditions: ;;
|
|
;;; 1. The code must retain the above copyright notice, this list of ;;
|
|
;;; conditions and the following disclaimer. ;;
|
|
;;; 2. Any modifications must be clearly marked as such. ;;
|
|
;;; 3. Original authors' names are not deleted. ;;
|
|
;;; 4. The authors' names are not used to endorse or promote products ;;
|
|
;;; derived from this software without specific prior written ;;
|
|
;;; permission. ;;
|
|
;;; ;;
|
|
;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;;
|
|
;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;
|
|
;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;
|
|
;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;;
|
|
;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;
|
|
;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;
|
|
;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;
|
|
;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;
|
|
;;; THIS SOFTWARE. ;;
|
|
;;; ;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; Various tts functions and hooks
|
|
|
|
;;; Once the utterance is built these functions synth and play it
|
|
(defvar tts_hooks (list utt.synth utt.play)
|
|
"tts_hooks
|
|
Function or list of functions to be called during text to speech.
|
|
The function tts_file, chunks data into Utterances of type Token and
|
|
applies this hook to the utterance. This typically contains the utt.synth
|
|
function and utt.play. [see TTS]")
|
|
|
|
;;; This is used to define utterance breaks in tts on files
|
|
(defvar eou_tree
|
|
'((lisp_max_num_tokens > 200)
|
|
((1))
|
|
((n.whitespace matches ".*\n.*\n\\(.\\|\n\\)*");; significant break (2 nls)
|
|
((1))
|
|
((name matches "--+")
|
|
((1))
|
|
((punc matches ".*[\\?:!;].*")
|
|
((1))
|
|
((punc matches ".*\\..*")
|
|
((punc matches "..+");; longer punctuation string
|
|
((punc matches "\\..*,") ;; for U.S.S.R., like tokens
|
|
((0))
|
|
((1)))
|
|
;; This is to distinguish abbreviations vs periods
|
|
;; These are heuristics
|
|
((name matches "\\(.*\\..*\\|[A-Z][A-Za-z]?[A-Za-z]?\\|etc\\)");; an abbreviation
|
|
((n.whitespace is " ")
|
|
((0));; if abbrev single space isn't enough for break
|
|
((n.name matches "[A-Z].*")
|
|
((1))
|
|
((0))))
|
|
((n.whitespace is " ");; if it doesn't look like an abbreviation
|
|
((n.name matches "[A-Z].*");; single space and non-cap is no break
|
|
((1))
|
|
((0)))
|
|
((1)))))
|
|
((0)))))))
|
|
"eou_tree
|
|
End of utterance tree. A decision tree used to determine if the given
|
|
token marks the end of an utterance. It may look one token ahead to
|
|
do this. [see Utterance chunking]")
|
|
|
|
(define (max_num_tokens x)
|
|
"(num_tokens x)
|
|
This is probably controversial, but its good to have a maximum number
|
|
of tokens in an utterance. You really dont want to wait on very long
|
|
utterances, some utts can be thousands of words long, these maybe
|
|
shouldn't be spoken, but we do have to deal with them."
|
|
(let ((c 1) (y x))
|
|
(while y
|
|
(set! c (+ 1 c))
|
|
(set! y (item.prev y)))
|
|
c))
|
|
|
|
;;; The program used to parse stml files
|
|
;;; Needs version 1.0 to allow -D option to work
|
|
(defvar sgml_parse_progname "nsgmls-1.0"
|
|
"sgml_parse_progname
|
|
The name of the program to use to parse SGML files. Typically this is
|
|
nsgml-1.0 from the sp SGML package. [see XML/SGML requirements]")
|
|
|
|
;;; When PHRASE elements are specified in an utterance in STML
|
|
;;; no other method for phrase prediction is to be used, so we
|
|
;;; use the following tree
|
|
(set! stml_phrase_cart_tree
|
|
'((R:Token.parent.pbreak is B)
|
|
((B))
|
|
((n.name is 0)
|
|
((B))
|
|
((NB)))))
|
|
|
|
(define (xxml_synth utt)
|
|
"(xxml_synth UTT)
|
|
This applies the xxml_hooks (mode specific) and tts_hooks to the
|
|
given utterance. This function should be called from xxml element
|
|
definitions that signal an utterance boundary."
|
|
(cond
|
|
((or (not utt)
|
|
(not (utt.relation utt 'Token))) ;; no tokens
|
|
nil)
|
|
(t
|
|
(apply_hooks xxml_hooks utt)
|
|
(apply_hooks tts_hooks utt)
|
|
(set! utt nil) ;; not enough ...
|
|
(gc)
|
|
utt))
|
|
)
|
|
|
|
(define (xxml_attval ATTNAME ATTLIST)
|
|
"(xxml_attval ATTNAME ATTLIST)
|
|
Returns attribute value of ATTNAME in ATTLIST or nil if it doesn't
|
|
exists."
|
|
(cond
|
|
((not ATTLIST)
|
|
nil)
|
|
((string-equal ATTNAME (car (car ATTLIST)))
|
|
(car (cdr (car ATTLIST))))
|
|
(t
|
|
(xxml_attval ATTNAME (cdr ATTLIST)))))
|
|
|
|
(defvar xxml_word_features nil
|
|
"xxml_word_features
|
|
An assoc list of features to be added to the current word when
|
|
in xxml parse mode.")
|
|
|
|
(defvar xxml_token_hooks nil
|
|
"xxml_token_hooks
|
|
Functions to apply to each token.")
|
|
|
|
(defvar xxml_hooks nil
|
|
"xxml_hooks
|
|
Function or list of functions to be applied to an utterance when
|
|
parsed with xxML, before tts_hooks.")
|
|
|
|
(defvar xxml_elements nil
|
|
"xxml_elements
|
|
List of Scheme actions to perform on finding xxML tags.")
|
|
|
|
(defvar xml_dtd_dir libdir
|
|
"xml_dtd_dir
|
|
The directory holding standard DTD form the xml parser.")
|
|
|
|
(set! tts_fnum 1)
|
|
(define (save_tts_output utt)
|
|
(let ((fname (string-append "tts_file_" tts_fnum ".wav")))
|
|
(format stderr "festival: saving waveform in %s\n" fname)
|
|
(utt.save.wave utt fname)
|
|
(set! tts_fnum (+ 1 tts_fnum))
|
|
utt))
|
|
|
|
(define (save_waves_during_tts)
|
|
"(save_waves_during_tts)
|
|
Save each waveform in the current directory in files \"tts_file_XXX.wav\".
|
|
use (save_waves_during_tts_STOP) to stop saving waveforms"
|
|
(if (not (member save_tts_output tts_hooks))
|
|
(set! tts_hooks (append tts_hooks (list save_tts_output))))
|
|
t)
|
|
|
|
(define (save_waves_during_tts_STOP)
|
|
"(save_waves_during_tts_STOP)
|
|
Stop saving waveforms when doing tts."
|
|
(if (member save_tts_output tts_hooks)
|
|
(set! tts_hooks (delq save_tts_output tts_hooks)))
|
|
t)
|
|
|
|
(define (tts file mode)
|
|
"(tts FILE MODE)
|
|
Convert FILE to speech. MODE identifies any special treatment
|
|
necessary for FILE. This is simply a front end to tts_file but
|
|
puts the system in async audio mode first. [see TTS]"
|
|
(audio_mode 'async)
|
|
(if mode
|
|
(tts_file file mode)
|
|
(tts_file file (tts_find_text_mode file auto-text-mode-alist)))
|
|
;; (audio_mode 'sync) ;; Hmm this is probably bad
|
|
)
|
|
|
|
(define (tts_text string mode)
|
|
"(tts_text STRING mode)
|
|
Apply tts on given string. That is, segment it into utterances and
|
|
apply tts_hooks to each utterance. This is naively done by saving the
|
|
string to a file and calling tts_file on that file. This differs from
|
|
SayText which constructs a single utterance for the whole given text."
|
|
(let ((tmpfile (make_tmp_filename))
|
|
(fd))
|
|
(set! fd (fopen tmpfile "wb"))
|
|
(format fd "%s" string)
|
|
(fclose fd)
|
|
(audio_mode 'async)
|
|
(tts_file tmpfile mode)
|
|
(delete-file tmpfile)))
|
|
|
|
(define (save_record_wave utt)
|
|
"Saves the waveform and records its so it can be joined into a
|
|
a single waveform at the end."
|
|
(let ((fn (make_tmp_filename)))
|
|
(utt.save.wave utt fn)
|
|
(set! wavefiles (cons fn wavefiles))
|
|
utt))
|
|
|
|
(define (combine_waves)
|
|
"(combine_waves)
|
|
Join all the waves together into the desired output file
|
|
and delete the intermediate ones."
|
|
(let ((wholeutt (Utterance Text "")))
|
|
(mapcar
|
|
(lambda (d)
|
|
(utt.import.wave wholeutt d t)
|
|
(delete-file d))
|
|
(reverse wavefiles))
|
|
wholeutt))
|
|
|
|
(define (tts_textall string mode)
|
|
"(tts_textall STRING MODE)
|
|
Apply tts to STRING. This function is specifically designed for
|
|
use in server mode so a single function call may synthesize the string.
|
|
This function name maybe added to the server safe functions."
|
|
(if (not (string-equal mode "nil"))
|
|
(begin
|
|
;; a mode has been specified so do something different
|
|
(let ((tmpfile (make_tmp_filename))
|
|
(fd))
|
|
(set! fd (fopen tmpfile "wb"))
|
|
(format fd "%s" string)
|
|
(fclose fd)
|
|
(set! tts_hooks (list utt.synth save_record_wave))
|
|
(set! wavefiles nil)
|
|
(tts_file tmpfile mode)
|
|
(delete-file tmpfile)
|
|
(utt.send.wave.client (combine_waves))
|
|
))
|
|
;; Simple fundamental mode
|
|
(utt.send.wave.client
|
|
(utt.synth
|
|
(eval (list 'Utterance 'Text string))))))
|
|
|
|
;; Function to interface with app_festival for asterisk
|
|
;; See http://www.asterisk.org
|
|
(define (tts_textasterisk string mode)
|
|
"(tts_textasterisk STRING MODE)
|
|
Apply tts to STRING. This function is specifically designed for
|
|
use in server mode so a single function call may synthesize the string.
|
|
This function name may be added to the server safe functions."
|
|
(utt.send.wave.asterisk
|
|
(utt.synth
|
|
(eval (list 'Utterance 'Text string)))))
|
|
|
|
|
|
|
|
(define (tts_return_to_client)
|
|
"(tts_return_to_client)
|
|
This function is called by clients who wish to return waveforms of
|
|
their text samples asynchronously. This replaces utt.play in tts_hooks
|
|
with utt.send.wave.client."
|
|
(if (not (member utt.send.wave.client tts_hooks))
|
|
(set! tts_hooks
|
|
(append (delq utt.play tts_hooks)
|
|
(list utt.send.wave.client)))))
|
|
|
|
(defvar tts_text_modes nil
|
|
"tts_text_modes
|
|
An a-list of text modes data for file type specific tts functions.
|
|
See the manual for an example. [see Text modes]")
|
|
|
|
(define (tts_find_text_mode file alist)
|
|
"(find_text_mode FILE ALIST)
|
|
Search through ALIST for one that matches FILE. Returns nil if
|
|
nothing macthes."
|
|
(cond
|
|
((null alist) nil) ;; can't find a match
|
|
((string-matches file (string-append ".*" (car (car alist)) ".*"))
|
|
(cdr (car alist)))
|
|
(t
|
|
(tts_find_text_mode file (cdr alist)))))
|
|
|
|
(defvar auto-text-mode-alist
|
|
(list
|
|
(cons "\\.sable$" 'sable)
|
|
(cons "\\.ogi" 'ogimarkup)
|
|
(cons "\\.email" 'email)
|
|
(cons "" 'fundamental)
|
|
)
|
|
"auto-text-mode-alist
|
|
Following Emacs' auto-mode-alist thios provides a mechanism for auto
|
|
selecting a TTS text mode based on the filename being analyzed. Its
|
|
format is exactly the same as Emacs in that it consists of an alist of
|
|
dotted pairs of regular expression and text mode name.")
|
|
|
|
(provide 'tts)
|