674 lines
23 KiB
Scheme
674 lines
23 KiB
Scheme
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; Festival Singing Mode
|
|
;;;
|
|
;;; Written by Dominic Mazzoni
|
|
;;; Carnegie Mellon University
|
|
;;; 11-752 - "Speech: Phonetics, Prosody, Perception and Synthesis"
|
|
;;; Spring 2001
|
|
;;;
|
|
;;; Extended by Milan Zamazal <pdm@brailcom.org>, 2006:
|
|
;;; - Slur support.
|
|
;;; - Czech support.
|
|
;;; - Some cleanup.
|
|
;;; - Print debugging information only when singing-debug is true.
|
|
;;;
|
|
;;; This code is public domain; anyone may use it freely.
|
|
;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(require_module 'rxp)
|
|
|
|
(xml_register_id "-//SINGING//DTD SINGING mark up//EN"
|
|
(path-append xml_dtd_dir "Singing.v0_1.dtd")
|
|
)
|
|
|
|
(xml_register_id "-//SINGING//ENTITIES Added Latin 1 for SINGING//EN"
|
|
(path-append xml_dtd_dir "sable-latin.ent")
|
|
)
|
|
|
|
;; Set this to t to enable debugging messages:
|
|
(defvar singing-debug nil)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; XML parsing functions
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;
|
|
;; singing_xml_targets
|
|
;;
|
|
;; This variable defines the actions that are to be taken when
|
|
;; parsing each of our XML tags: SINGING, PITCH, DURATION, and REST.
|
|
;;
|
|
;; When we get the pitch and duration of each token, we store them
|
|
;; in features of the token. Later our intonation and duration
|
|
;; functions access these features.
|
|
;;
|
|
|
|
(defvar singing_xml_elements
|
|
'(
|
|
("(SINGING" (ATTLIST UTT)
|
|
(set! singing_pitch_att_list nil)
|
|
(set! singing_dur_att_list nil)
|
|
(set! singing_global_time 0.0)
|
|
(set! singing_bpm (get-bpm ATTLIST))
|
|
(set! singing_bps (/ singing_bpm 50.0)) ;; change this back to 60
|
|
nil)
|
|
|
|
(")SINGING" (ATTLIST UTT)
|
|
(xxml_synth UTT) ;; Synthesize the remaining tokens
|
|
nil)
|
|
|
|
("(PITCH" (ATTLIST UTT)
|
|
(set! singing_pitch_att_list ATTLIST)
|
|
UTT)
|
|
|
|
(")PITCH" (ATTLIST UTT)
|
|
(let ((freq (get-freqs singing_pitch_att_list)))
|
|
(if singing-debug
|
|
(begin
|
|
(print "freqs")
|
|
(print freq)))
|
|
(singing-append-feature! UTT 'freq freq))
|
|
UTT)
|
|
|
|
("(DURATION" (ATTLIST UTT)
|
|
(set! singing_dur_att_list ATTLIST)
|
|
UTT)
|
|
|
|
(")DURATION" (ATTLIST UTT)
|
|
(let ((dur (get-durs singing_dur_att_list)))
|
|
(if singing-debug
|
|
(begin
|
|
(print "durs")
|
|
(print dur)))
|
|
(singing-append-feature! UTT 'dur dur))
|
|
UTT)
|
|
|
|
("(REST" (ATTLIST UTT)
|
|
(let ((dur (get-durs ATTLIST)))
|
|
(if singing-debug
|
|
(begin
|
|
(print "rest durs")
|
|
(print dur)))
|
|
(singing-append-feature! UTT 'rest (caar dur)))
|
|
UTT)
|
|
))
|
|
|
|
;;
|
|
;; get-bpm
|
|
;;
|
|
;; Given the attribute list of a SINGING tag, returns the beats
|
|
;; per minute of the song from the BPM parameter.
|
|
;;
|
|
|
|
(define (get-bpm atts)
|
|
(parse-number (car (car (cdr (assoc 'BPM atts))))))
|
|
|
|
;;
|
|
;; get-durs
|
|
;;
|
|
;; Given the attribute list of a DURATION tag, returns a list of
|
|
;; durations in seconds for the syllables of the word enclosed by
|
|
;; this tag.
|
|
;;
|
|
;; It first looks for a BEATS parameter, and converts these to
|
|
;; seconds using BPM, which was set in the SINGING tag. If this
|
|
;; is not present, it looks for the SECONDS parameter.
|
|
;;
|
|
|
|
(define (get-durs atts)
|
|
(let ((seconds (car (car (cdr (assoc 'SECONDS atts)))))
|
|
(beats (car (car (cdr (assoc 'BEATS atts))))))
|
|
(if (equal? beats 'X)
|
|
(mapcar (lambda (lst) (mapcar parse-number lst))
|
|
(string->list seconds))
|
|
(mapcar (lambda (lst)
|
|
(mapcar (lambda (x) (/ (parse-number x) singing_bps)) lst))
|
|
(string->list beats)))))
|
|
|
|
;;
|
|
;; get-freqs
|
|
;;
|
|
;; Given the attribute list of a PITCH tag, returns a list of
|
|
;; frequencies in Hertz for the syllables of the word enclosed by
|
|
;; this tag.
|
|
;;
|
|
;; It first looks for a NOTE parameter, which can contain a MIDI
|
|
;; note of the form "C4", "D#3", or "Ab6", and if this is not
|
|
;; present it looks for the FREQ parameter.
|
|
;;
|
|
|
|
(define (get-freqs atts)
|
|
(let ((freqs (car (car (cdr (assoc 'FREQ atts)))))
|
|
(notes (car (car (cdr (assoc 'NOTE atts))))))
|
|
(if (equal? notes 'X)
|
|
(mapcar (lambda (lst) (mapcar parse-number lst))
|
|
(string->list freqs))
|
|
(mapcar (lambda (lst) (mapcar note->freq lst))
|
|
(string->list notes)))))
|
|
|
|
;;
|
|
;; note->freq
|
|
;;
|
|
;; Converts a string representing a MIDI note such as "C4" and
|
|
;; turns it into a frequency. We use the convention that
|
|
;; A5=440 (some call this note A3).
|
|
;;
|
|
|
|
(define (note->freq note)
|
|
(if singing-debug
|
|
(format t "note is %l\n" note))
|
|
(set! note (format nil "%s" note))
|
|
(if singing-debug
|
|
(print_string note))
|
|
(let (l octave notename midinote thefreq)
|
|
(set! l (string-length note))
|
|
(set! octave (substring note (- l 1) 1))
|
|
(set! notename (substring note 0 (- l 1)))
|
|
(set! midinote (+ (* 12 (parse-number octave))
|
|
(notename->midioffset notename)))
|
|
(set! thefreq (midinote->freq midinote))
|
|
(if singing-debug
|
|
(format t "note %s freq %f\n" note thefreq))
|
|
thefreq))
|
|
|
|
;;
|
|
;; midinote->freq
|
|
;;
|
|
;; Converts a MIDI note number (1 - 127) into a frequency. We use
|
|
;; the convention that 69 = "A5" =440 Hz.
|
|
;;
|
|
|
|
(define (midinote->freq midinote)
|
|
(* 440.0 (pow 2.0 (/ (- midinote 69) 12))))
|
|
|
|
;;
|
|
;; notename->midioffset
|
|
;;
|
|
;; Utility function that looks up the name of a note like "F#" and
|
|
;; returns its offset from C.
|
|
;;
|
|
|
|
(define (notename->midioffset notename)
|
|
(parse-number (car (cdr (assoc_string notename note_names)))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; Pitch modification functions
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;
|
|
;; singing_f0_targets
|
|
;;
|
|
;; This function replaces the normal intonation function used in
|
|
;; festival. For each syllable, it extracts the frequency that
|
|
;; was calculated from the XML tags and stored in the token this
|
|
;; syllable comes from, and sets this frequency as both the start
|
|
;; and end f0 target. Really straightforward!
|
|
;;
|
|
|
|
(defvar singing-last-f0 nil)
|
|
(define (singing_f0_targets utt syl)
|
|
"(singing_f0_targets utt syl)"
|
|
(let ((start (item.feat syl 'syllable_start))
|
|
(end (item.feat syl 'syllable_end))
|
|
(freqs (mapcar parse-number (syl->freq syl)))
|
|
(durs (syl->durations syl)))
|
|
(let ((total-durs (apply + durs))
|
|
(total-time (- end start))
|
|
(time start)
|
|
(prev-segment (item.prev (item.relation (item.daughter1 (item.relation syl 'SylStructure)) 'Segment)))
|
|
(last-f0 singing-last-f0))
|
|
(if freqs
|
|
(begin
|
|
(set! singing-last-f0 (car (last freqs)))
|
|
(append (if (and last-f0
|
|
prev-segment
|
|
(item.prev prev-segment)
|
|
(string-equal (item.feat prev-segment 'name)
|
|
(car (car (cdr (car (PhoneSet.description '(silences))))))))
|
|
(let ((s (item.feat prev-segment "p.end"))
|
|
(e (item.feat prev-segment "end")))
|
|
(list (list (+ s (* (- e s) 0.8)) last-f0)
|
|
(list (+ s (* (- e s) 0.9)) (car freqs)))))
|
|
(apply append
|
|
(mapcar (lambda (d f)
|
|
(let ((range (* (/ d total-durs) total-time))
|
|
(old-time time))
|
|
(set! time (+ time range))
|
|
(let ((range-fraction (* 0.1 range)))
|
|
(list (list (+ old-time range-fraction) f)
|
|
(list (- time range-fraction) f)))))
|
|
durs freqs))))))))
|
|
|
|
;;
|
|
;; syl->freq
|
|
;;
|
|
;; Given a syllable, looks up the frequency in its token. The token
|
|
;; stores a list of all of the frequencies associated with its
|
|
;; syllables, so this syllable grabs the frequency out of the list
|
|
;; corresponding to its index within the word. (This assumes that
|
|
;; a frequency was given for each syllable, and that a token
|
|
;; corresponds directly to a word. Singing-mode is not guaranteed
|
|
;; to work at all if either of these things are not true.)
|
|
;;
|
|
|
|
(define (syl->freq syl)
|
|
(let ((index (item.feat syl "R:Syllable.pos_in_word"))
|
|
(freqs (singing-feat syl "R:SylStructure.parent.R:Token.parent.freq")))
|
|
(nth index freqs)))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; Duration modification functions
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;
|
|
;; singing_duration_method
|
|
;;
|
|
;; Calculates the duration of each phone in the utterance, in three
|
|
;; passes. Consult the three functions it calls, below, to see what
|
|
;; each one does.
|
|
;;
|
|
|
|
(define (singing_duration_method utt)
|
|
(mapcar singing_adjcons_syllable (utt.relation.items utt 'Syllable))
|
|
(singing_do_initial utt (car (utt.relation.items utt 'Token)))
|
|
(mapcar singing_do_syllable (utt.relation.items utt 'Syllable))
|
|
(mapcar singing_fix_segment (utt.relation.items utt 'Segment))
|
|
utt)
|
|
|
|
;;
|
|
;; singing_adjcons_syllable
|
|
;;
|
|
;; First pass. Looks at the first phone of each syllable and
|
|
;; adjusts the starting time of this syllable such that the
|
|
;; perceived start time of the first phone is at the beginning
|
|
;; of the originally intended start time of the syllable.
|
|
;;
|
|
;; If this is not done, telling it to say the word "ta" at time
|
|
;; 2.0 actually doesn't "sound" like it says the "t" sound until
|
|
;; about 2.1 seconds.
|
|
;;
|
|
;; This function has a little bit of duplicated code from
|
|
;; singing_do_syllable, below - it could be modularized a little
|
|
;; better.
|
|
;;
|
|
|
|
(define (singing_adjcons_syllable syl)
|
|
(let ((totlen (apply + (mapcar (lambda (s)
|
|
(get_avg_duration (item.feat s "name")))
|
|
(item.leafs
|
|
(item.relation syl 'SylStructure)))))
|
|
(syldur (apply + (syl->durations syl)))
|
|
;; figure out the offset of the first phone
|
|
(phone1 (item.daughter1 (item.relation syl 'SylStructure)))
|
|
(prevsyl (item.prev (item.relation syl 'Syllable))))
|
|
(let ((offset (get_duration_offset (item.feat phone1 "name"))))
|
|
(if singing-debug
|
|
(format t "offset: %f\n" offset) )
|
|
(if (< syldur totlen)
|
|
(set! offset (* offset (/ syldur totlen))))
|
|
(if singing-debug
|
|
(format t "Want to adjust syl by %f\n" offset))
|
|
(if prevsyl
|
|
(begin
|
|
(item.set_feat prevsyl 'subtractoffset offset)
|
|
(item.set_feat syl 'addoffset offset))))))
|
|
|
|
;;
|
|
;; singing_do_syllable
|
|
;;
|
|
;; Second pass. For each syllable, adds up the amount of time
|
|
;; that would normally be spent in consonants and vowels, based
|
|
;; on the average durations of these phones. Then, if the
|
|
;; intended length of this syllable is longer than this total,
|
|
;; stretch only the vowels; otherwise shrink all phones
|
|
;; proportionally. This function actually sets the "end" time
|
|
;; of each phone using a global "singing_global_time" variable.
|
|
;;
|
|
;; We also handle rests at this point, which are tagged onto the
|
|
;; end of the previous token.
|
|
;;
|
|
|
|
(defvar singing-max-short-vowel-length 0.11)
|
|
|
|
(define (singing_do_initial utt token)
|
|
(if (equal? (item.name token) "")
|
|
(let ((restlen (car (item.feat token 'rest))))
|
|
(if singing-debug
|
|
(format t "restlen %l\n" restlen))
|
|
(if (> restlen 0)
|
|
(let ((silence (car (car (cdr (assoc 'silences (PhoneSet.description)))))))
|
|
(set! singing_global_time restlen)
|
|
(item.relation.insert (utt.relation.first utt 'Segment) 'Segment
|
|
(list silence (list (list "end" singing_global_time)))
|
|
'before))))))
|
|
|
|
(define (singing_do_syllable syl)
|
|
(let ((conslen 0.0)
|
|
(vowlen 0.0)
|
|
(segments (item.leafs (item.relation syl 'SylStructure))))
|
|
;; if there are no vowels, turn a middle consonant into a vowel;
|
|
;; hopefully this works well for languages where syllables may be
|
|
;; created by some consonants too
|
|
(let ((segments* segments)
|
|
(vowel-found nil))
|
|
(while (and segments* (not vowel-found))
|
|
(if (equal? "+" (item.feat (car segments*) "ph_vc"))
|
|
(set! vowel-found t)
|
|
(set! segments* (cdr segments*))))
|
|
(if (not vowel-found)
|
|
(item.set_feat (nth (nint (/ (- (length segments) 1) 2))
|
|
segments)
|
|
"singing-vc" "+")))
|
|
;; sum up the length of all of the vowels and consonants in
|
|
;; this syllable
|
|
(mapcar (lambda (s)
|
|
(let ((slen (get_avg_duration (item.feat s "name"))))
|
|
(if (or (equal? "+" (item.feat s "ph_vc"))
|
|
(equal? "+" (item.feat s "singing-vc")))
|
|
(set! vowlen (+ vowlen slen))
|
|
(set! conslen (+ conslen slen)))))
|
|
segments)
|
|
(let ((totlen (+ conslen vowlen))
|
|
(syldur (apply + (syl->durations syl)))
|
|
(addoffset (item.feat syl 'addoffset))
|
|
(subtractoffset (item.feat syl 'subtractoffset))
|
|
offset)
|
|
(set! offset (- subtractoffset addoffset))
|
|
(if singing-debug
|
|
(format t "Vowlen: %f conslen: %f totlen: %f\n" vowlen conslen totlen))
|
|
(if (< offset (/ syldur 2.0))
|
|
(begin
|
|
(set! syldur (- syldur offset))
|
|
(if singing-debug
|
|
(format t "Offset: %f\n" offset))))
|
|
(if singing-debug
|
|
(format t "Syldur: %f\n" syldur))
|
|
(if (> totlen syldur)
|
|
;; if the total length of the average durations in the syllable is
|
|
;; greater than the total desired duration of the syllable, stretch
|
|
;; the time proportionally for each phone
|
|
(let ((stretch (/ syldur totlen)))
|
|
(mapcar (lambda (s)
|
|
(let ((slen (* stretch (get_avg_duration (item.feat s "name")))))
|
|
(set! singing_global_time (+ slen singing_global_time))
|
|
(item.set_feat s 'end singing_global_time)))
|
|
(item.leafs (item.relation syl 'SylStructure))))
|
|
;; otherwise, stretch the vowels and not the consonants
|
|
(let ((voweltime (- syldur conslen)))
|
|
(let ((vowelstretch (/ voweltime vowlen))
|
|
(phones (mapcar car (car (cdar (PhoneSet.description '(phones)))))))
|
|
(mapcar (lambda (s)
|
|
(let ((slen (get_avg_duration (item.feat s "name"))))
|
|
(if (or (equal? "+" (item.feat s "ph_vc"))
|
|
(equal? "+" (item.feat s "singing-vc")))
|
|
(begin
|
|
(set! slen (* vowelstretch slen))
|
|
;; If the sound is long enough, better results
|
|
;; may be achieved by using longer versions of
|
|
;; the vowels.
|
|
(if (> slen singing-max-short-vowel-length)
|
|
(let ((sname (string-append (item.feat s "name") ":")))
|
|
(if (member_string sname phones)
|
|
(item.set_feat s "name" sname))))))
|
|
(set! singing_global_time (+ slen singing_global_time))
|
|
(item.set_feat s 'end singing_global_time)))
|
|
segments))))))
|
|
(let ((restlen (car (syl->rest syl))))
|
|
(if singing-debug
|
|
(format t "restlen %l\n" restlen))
|
|
(if (> restlen 0)
|
|
(let ((lastseg (item.daughtern (item.relation syl 'SylStructure)))
|
|
(silence (car (car (cdr (assoc 'silences (PhoneSet.description))))))
|
|
(singing_global_time* singing_global_time))
|
|
(let ((seg (item.relation lastseg 'Segment))
|
|
(extra-pause-length 0.00001))
|
|
(set! singing_global_time (+ restlen singing_global_time))
|
|
(item.insert seg (list silence (list (list "end" singing_global_time))) 'after)
|
|
;; insert a very short extra pause to avoid after-effects, especially
|
|
;; after vowels
|
|
(if (and seg
|
|
(equal? (item.feat seg "ph_vc") "+")
|
|
(< extra-pause-length restlen))
|
|
(item.insert seg (list silence (list (list "end" (+ singing_global_time*
|
|
extra-pause-length))))
|
|
'after)))))))
|
|
|
|
;;
|
|
;; singing_fix_segment
|
|
;;
|
|
;; Third pass. Finds any segments (phones) that we didn't catch earlier
|
|
;; (say if they didn't belong to a syllable, like silence) and sets them
|
|
;; to zero duration
|
|
;;
|
|
|
|
(define (singing_fix_segment seg)
|
|
(if (equal? 0.0 (item.feat seg 'end))
|
|
(if (equal? nil (item.prev seg))
|
|
(item.set_feat seg 'end 0.0)
|
|
(item.set_feat seg 'end (item.feat (item.prev seg) 'end)))
|
|
(if singing-debug
|
|
(format t "segment: %s end: %f\n" (item.name seg) (item.feat seg 'end)))))
|
|
|
|
;; returns the duration of a syllable (stored in its token)
|
|
(define (syl->durations syl)
|
|
(let ((index (item.feat syl "R:Syllable.pos_in_word"))
|
|
(durs (singing-feat syl "R:SylStructure.parent.R:Token.parent.dur")))
|
|
(mapcar parse-number (nth index durs))))
|
|
|
|
;; returns the duration of the rest following a syllable
|
|
(define (syl->rest syl)
|
|
(let ((index (item.feat syl "R:Syllable.pos_in_word"))
|
|
(durs (singing-feat syl "R:SylStructure.parent.R:Token.parent.dur"))
|
|
(pauselen (singing-feat syl "R:SylStructure.parent.R:Token.parent.rest")))
|
|
(if (equal? index (- (length durs) 1))
|
|
(list (or pauselen 0.0))
|
|
(list 0.0))))
|
|
|
|
;; get the average duration of a phone
|
|
(define (get_avg_duration phone)
|
|
(let ((pd (assoc_string phone phoneme_durations)))
|
|
(if pd
|
|
(car (cdr pd))
|
|
0.08)))
|
|
|
|
;; get the duration offset of a phone (see the description above)
|
|
(define (get_duration_offset phone)
|
|
(parse-number (car (cdr (assoc_string phone phoneme_offsets*)))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; Other utility functions
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(define (char-quote string)
|
|
(if (member string '("*" "+" "?" "[" "]" "."))
|
|
(string-append "[" string "]")
|
|
string))
|
|
|
|
(define (split-string string separator)
|
|
(if (string-matches string (string-append ".+" (char-quote separator) ".+"))
|
|
(cons (string-before string separator)
|
|
(split-string (string-after string separator) separator))
|
|
;; We have to convert the weird XML attribute value type to string
|
|
(list (string-append string ""))))
|
|
|
|
(define (string->list string)
|
|
(mapcar (lambda (s) (split-string s "+")) (split-string string ",")))
|
|
|
|
(define (singing-append-feature! utt feature value)
|
|
(let ((tokens (utt.relation.items utt 'Token)))
|
|
(if tokens
|
|
;; we have to wrap value into a list to work around a Festival bug
|
|
(item.set_feat (car (last tokens)) feature (list value))
|
|
(begin
|
|
(utt.relation.append utt 'Token '("" ((name "") (whitespace "")
|
|
(prepunctuation "") (punc ""))))
|
|
(item.set_feat (car (last (utt.relation.items utt 'Token))) feature (list value))))))
|
|
|
|
(define (singing-feat item feature)
|
|
(let ((value (item.feat item feature)))
|
|
(if (equal? value 0)
|
|
nil
|
|
(car value))))
|
|
|
|
(define (current-language)
|
|
(cadr (car (assoc 'language (voice.description current-voice)))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; Initializing and exiting singing mode
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;
|
|
;; singing_init_func
|
|
;;
|
|
|
|
(defvar singing_previous_eou_tree nil)
|
|
|
|
(define (singing_init_func)
|
|
"(singing_init_func) - Initialization for Singing mode"
|
|
(if (not (symbol-bound? 'phoneme_durations))
|
|
(set! phoneme_durations '()))
|
|
;; use our intonation function
|
|
(Parameter.set 'Int_Method 'General)
|
|
(Parameter.set 'Int_Target_Method Int_Targets_General)
|
|
(set! int_general_params `((targ_func ,singing_f0_targets)))
|
|
(set! singing-last-f0 nil)
|
|
;; use our duration function
|
|
(Parameter.set 'Duration_Method singing_duration_method)
|
|
;; set phoneme corrections for the current language
|
|
(let ((language 'english
|
|
; (cadr (assoc 'language
|
|
; (cadr (voice.description current-voice))))
|
|
))
|
|
(set! phoneme_offsets* (cdr (assoc language phoneme_offsets))))
|
|
;; avoid splitting to multiple utterances with insertion of unwanted pauses
|
|
(set! singing_previous_eou_tree eou_tree)
|
|
(set! eou_tree nil)
|
|
;; use our xml parsing function
|
|
(set! singing_previous_elements xxml_elements)
|
|
(set! xxml_elements singing_xml_elements))
|
|
|
|
;;
|
|
;; singing_exit_func
|
|
;;
|
|
|
|
(define (singing_exit_func)
|
|
"(singing_exit_func) - Exit function for Singing mode"
|
|
(set! eou_tree singing_previous_eou_tree)
|
|
(set! xxml_elements singing_previous_elements))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;
|
|
;; Data tables
|
|
;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(defvar note_names
|
|
'((C 0)
|
|
(C# 1)
|
|
(Db 1)
|
|
(D 2)
|
|
(D# 3)
|
|
(Eb 3)
|
|
(E 4)
|
|
(E# 5)
|
|
(Fb 4)
|
|
(F 5)
|
|
(F# 6)
|
|
(Gb 6)
|
|
(G 7)
|
|
(G# 8)
|
|
(Ab 8)
|
|
(A 9)
|
|
(A# 10)
|
|
(Bb 10)
|
|
(B 11)
|
|
(B# 12)
|
|
(Cb 11)))
|
|
|
|
;;
|
|
;; The following list contains the offset into each phone that best
|
|
;; represents the perceptual onset of the phone. This is important
|
|
;; to know to get durations right in singing. For example, if the
|
|
;; offset for "t" is .060, and you want to start a "t" sound at
|
|
;; time 2.0 seconds, you should actually start the phone playing
|
|
;; at time 1.940 seconds in order for it to sound like the onset of
|
|
;; the "t" is really right at 2.0.
|
|
;;
|
|
;; These were derived empically by looking at and listening to the
|
|
;; waveforms of each phone for mwm's voice.
|
|
;;
|
|
|
|
(defvar phoneme_offsets
|
|
`((english (t 0.050)
|
|
(T 0.050)
|
|
(d 0.090)
|
|
(D 0.090)
|
|
(p 0.080)
|
|
(b 0.080)
|
|
(k 0.090)
|
|
(g 0.100)
|
|
(9r 0.050) ;; r
|
|
(l 0.030)
|
|
(f 0.050)
|
|
(v 0.050)
|
|
(s 0.040)
|
|
(S 0.040)
|
|
(z 0.040)
|
|
(Z 0.040)
|
|
(n 0.040)
|
|
(N 0.040)
|
|
(m 0.040)
|
|
(j 0.090)
|
|
(E 0.0)
|
|
(> 0.0)
|
|
(>i 0.0)
|
|
(aI 0.0)
|
|
(& 0.0)
|
|
(3r 0.0)
|
|
(tS 0.0)
|
|
(oU 0.0)
|
|
(aU 0.0)
|
|
(A 0.0)
|
|
(ei 0.0)
|
|
(iU 0.0)
|
|
(U 0.0)
|
|
(@ 0.0)
|
|
(h 0.0)
|
|
(u 0.0)
|
|
(^ 0.0)
|
|
(I 0.0)
|
|
(dZ 0.0)
|
|
(i: 0.0)
|
|
(w 0.0)
|
|
(pau 0.0)
|
|
(brth 0.0)
|
|
(h# 0.0)
|
|
)))
|
|
|
|
(defvar phoneme_offsets* nil)
|
|
|
|
;;
|
|
;; Declare the new mode to Festival
|
|
;;
|
|
|
|
(set! tts_text_modes
|
|
(cons `(singing ;; mode name
|
|
((init_func ,singing_init_func)
|
|
(exit_func ,singing_exit_func)
|
|
(analysis_type xml)))
|
|
tts_text_modes))
|
|
|
|
(provide 'singing-mode)
|