480 lines
16 KiB
Scheme
480 lines
16 KiB
Scheme
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; ;;
|
|
;;; Carnegie Mellon University and ;;
|
|
;;; Centre for Speech Technology Research ;;
|
|
;;; University of Edinburgh, UK ;;
|
|
;;; Copyright (c) 1998-2005 ;;
|
|
;;; All Rights Reserved. ;;
|
|
;;; ;;
|
|
;;; Permission is hereby granted, free of charge, to use and distribute ;;
|
|
;;; this software and its documentation without restriction, including ;;
|
|
;;; without limitation the rights to use, copy, modify, merge, publish, ;;
|
|
;;; distribute, sublicense, and/or sell copies of this work, and to ;;
|
|
;;; permit persons to whom this work is furnished to do so, subject to ;;
|
|
;;; the following conditions: ;;
|
|
;;; 1. The code must retain the above copyright notice, this list of ;;
|
|
;;; conditions and the following disclaimer. ;;
|
|
;;; 2. Any modifications must be clearly marked as such. ;;
|
|
;;; 3. Original authors' names are not deleted. ;;
|
|
;;; 4. The authors' names are not used to endorse or promote products ;;
|
|
;;; derived from this software without specific prior written ;;
|
|
;;; permission. ;;
|
|
;;; ;;
|
|
;;; THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE ;;
|
|
;;; CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO ;;
|
|
;;; THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY ;;
|
|
;;; AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE ;;
|
|
;;; MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, ;;
|
|
;;; INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ;;
|
|
;;; RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION ;;
|
|
;;; OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;;
|
|
;;; OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;;
|
|
;;; ;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; Cluster Unit selection support (Black and Taylor Eurospeech '97)
|
|
;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; clunits build support
|
|
;;;
|
|
;;; There are five stages to this
|
|
;;; Load in all utterances
|
|
;;; Load in their coefficients
|
|
;;; Collect together the units of the same type
|
|
;;; build distance tables from them
|
|
;;; dump features for them
|
|
;;;
|
|
|
|
(require_module 'clunits) ;; C++ modules support
|
|
(require 'clunits) ;; run time scheme support
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(define (do_all)
|
|
(let ()
|
|
|
|
(format t "Loading utterances and sorting types\n")
|
|
(set! utterances (acost:db_utts_load clunits_params))
|
|
(set! unittypes (acost:find_same_types utterances clunits_params))
|
|
(acost:name_units unittypes)
|
|
|
|
(format t "Dumping features for clustering\n")
|
|
(acost:dump_features unittypes utterances clunits_params)
|
|
|
|
(format t "Loading coefficients\n")
|
|
(acost:utts_load_coeffs utterances)
|
|
;; If you are short of diskspace try this
|
|
(acost:disttabs_and_clusters unittypes clunits_params)
|
|
|
|
;; or if you have lots of diskspace try
|
|
; (format t "Building distance tables\n")
|
|
; (acost:build_disttabs unittypes clunits_params)
|
|
|
|
; ;; Build the cluster trees (requires disttabs and features)
|
|
; (format t "Building cluster trees\n")
|
|
; (acost:find_clusters (mapcar car unittypes) clunits_params)
|
|
|
|
;; Tidy up and put things together
|
|
(acost:collect_trees (mapcar car unittypes) clunits_params)
|
|
|
|
(format t "Saving unit catalogue\n")
|
|
(acost:save_catalogue utterances clunits_params)
|
|
|
|
)
|
|
)
|
|
|
|
(define (do_init)
|
|
(set! utterances (acost:db_utts_load clunits_params))
|
|
(set! unittypes (acost:find_same_types utterances clunits_params))
|
|
(acost:name_units unittypes)
|
|
t)
|
|
|
|
(define (acost:disttabs_and_clusters unittypes clunits_params)
|
|
"(acost:disttabs_and_custers unittypes)
|
|
Cause it uses so much diskspace, build each table individually
|
|
and them the cluster, removing the table before moving on to the
|
|
next."
|
|
(mapcar
|
|
(lambda (uu)
|
|
(acost:build_disttabs (list uu) clunits_params)
|
|
(acost:find_clusters (list (car uu)) clunits_params)
|
|
(delete-file
|
|
(format nil "%s/%s/%s%s"
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'disttabs_dir clunits_params "disttabs/")
|
|
(car uu)
|
|
(get_param 'disttabs_ext clunits_params ".disttab")))
|
|
)
|
|
unittypes)
|
|
t)
|
|
|
|
(define (acost:db_utts_load params)
|
|
"(acost:db_utts_load params)
|
|
Load in all utterances identified in database."
|
|
(let ((files (car (cdr (assoc 'files params)))))
|
|
(set! acost:all_utts
|
|
(mapcar
|
|
(lambda (fname)
|
|
(set! utt_seg (Utterance Text fname))
|
|
(utt.load utt_seg
|
|
(string-append
|
|
(get_param 'db_dir params "./")
|
|
(get_param 'utts_dir params "festival/utts/")
|
|
fname
|
|
(get_param 'utts_ext params ".utt")))
|
|
utt_seg)
|
|
files))))
|
|
|
|
(define (acost:utts_load_coeffs utterances)
|
|
"(acost:utts_load_coeffs utterances)
|
|
Loading the acoustic coefficients of for each utterance."
|
|
(mapcar
|
|
(lambda (utt) (acost:utt.load_coeffs utt clunits_params))
|
|
utterances)
|
|
t)
|
|
|
|
(define (acost:find_same_types utterances params)
|
|
"(acost:find_same_types utterances)
|
|
Find all the stream items of the same type and collect them into
|
|
lists of that type."
|
|
(let ((clunit_name_feat (get_param 'clunit_name_feat params "name"))
|
|
(clunit_relation (get_param 'clunit_relation params "Segment")))
|
|
(set! acost:unittypes nil)
|
|
(mapcar
|
|
(lambda (u)
|
|
(mapcar
|
|
(lambda (s)
|
|
(let ((cname (item.feat s clunit_name_feat)))
|
|
(if (not (string-equal "ignore" cname))
|
|
(begin
|
|
(item.set_feat s "clunit_name" (item.feat s clunit_name_feat))
|
|
(let ((p (assoc (item.feat s "clunit_name") acost:unittypes)))
|
|
(if p
|
|
(set-cdr! p (cons s (cdr p)))
|
|
(set! acost:unittypes
|
|
(cons
|
|
(list (item.feat s "clunit_name") s)
|
|
acost:unittypes))))))))
|
|
(utt.relation.items u clunit_relation)))
|
|
utterances)
|
|
(acost:prune_unittypes acost:unittypes params)))
|
|
|
|
(define (acost:prune_unittypes unittypes params)
|
|
"(acost:prune_unittypes unittypes)
|
|
If unit types are complex (contain an _) then remove all unittypes sets
|
|
with less than unittype_prune_threshold (typically 3)."
|
|
(if (string-matches (car (car unittypes)) ".*_.*")
|
|
(let ((ut nil) (pt (get_param 'unittype_prune_threshold params 0)))
|
|
(while unittypes
|
|
(if (or (eq? pt 0)
|
|
(> (length (cdr (car unittypes))) pt))
|
|
(set! ut (cons (car unittypes) ut)))
|
|
(set! unittypes (cdr unittypes)))
|
|
(reverse ut))
|
|
unittypes))
|
|
|
|
(define (acost:name_units unittypes)
|
|
"(acost:name_units unittypes)
|
|
Names each unit with a unique id and number the occurrences of each type."
|
|
(let ((idnum 0) (tynum 0))
|
|
(mapcar
|
|
(lambda (s)
|
|
(set! tynum 0)
|
|
(mapcar
|
|
(lambda (si)
|
|
(item.set_feat si "unitid" idnum)
|
|
(set! idnum (+ 1 idnum))
|
|
(item.set_feat si "occurid" tynum)
|
|
(set! tynum (+ 1 tynum)))
|
|
(cdr s))
|
|
(format t "units \"%s\" %d\n" (car s) tynum))
|
|
unittypes)
|
|
(format t "total units %d\n" idnum)
|
|
idnum))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Generating feature files
|
|
|
|
(define (acost:dump_features unittypes utterances params)
|
|
"(acost:dump_features unittypes utterances params)
|
|
Do multiple passes over the utterances for each unittype and
|
|
dump the desired features. This would be easier if utterances
|
|
weren't require for feature functions."
|
|
(mapcar
|
|
(lambda (utype)
|
|
(acost:dump_features_utype
|
|
(car utype)
|
|
(cdr utype)
|
|
utterances
|
|
params))
|
|
unittypes)
|
|
t)
|
|
|
|
(define (acost:dump_features_utype utype uitems utterances params)
|
|
"(acost:dump_features_utype utype utterances params)
|
|
Dump features for all items of type utype."
|
|
(let ((fd (fopen
|
|
(string-append
|
|
(get_param 'db_dir params "./")
|
|
(get_param 'feats_dir params "festival/feats/")
|
|
utype
|
|
(get_param 'feats_ext params ".feats"))
|
|
"w"))
|
|
(feats (car (cdr (assoc 'feats params)))))
|
|
(format t "Dumping features for %s\n" utype)
|
|
(mapcar
|
|
(lambda (s)
|
|
(mapcar
|
|
(lambda (f)
|
|
(set! fval (unwind-protect (item.feat s f) "0"))
|
|
(if (or (string-equal "" fval)
|
|
(string-equal " " fval))
|
|
(format fd "%l " fval)
|
|
(format fd "%s " fval)))
|
|
feats)
|
|
(format fd "\n"))
|
|
uitems)
|
|
(fclose fd)))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Tree building functions
|
|
|
|
(defvar wagon-balance-size 0)
|
|
|
|
(define (acost:find_clusters unittypes clunits_params)
|
|
"Use wagon to find the best clusters."
|
|
(mapcar
|
|
(lambda (unittype)
|
|
(build_tree unittype clunits_params))
|
|
unittypes)
|
|
t)
|
|
|
|
(define (build_tree unittype clunits_params)
|
|
"Build tree with Wagon for this unittype."
|
|
(let ((command
|
|
(format nil "%s -desc %s -data '%s' -balance %s -distmatrix '%s' -stop %s -output '%s' %s"
|
|
(get_param 'wagon_progname clunits_params "wagon")
|
|
(if (probe_file
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'wagon_field_desc clunits_params "wagon")
|
|
"." unittype))
|
|
;; So there can be unittype specific desc files
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'wagon_field_desc clunits_params "wagon")
|
|
"." unittype)
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'wagon_field_desc clunits_params "wagon")))
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'feats_dir clunits_params "festival/feats/")
|
|
unittype
|
|
(get_param 'feats_ext clunits_params ".feats"))
|
|
(get_param 'wagon_balance_size clunits_params 0)
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'disttabs_dir clunits_params "festival/disttabs/")
|
|
unittype
|
|
(get_param 'disttabs_ext clunits_params ".disttab"))
|
|
(get_param 'wagon_cluster_size clunits_params 10)
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'trees_dir clunits_params "festival/trees/")
|
|
unittype
|
|
(get_param 'trees_ext clunits_params ".tree"))
|
|
(get_param 'wagon_other_params clunits_params "")
|
|
)))
|
|
(format t "%s\n" command)
|
|
(system command)))
|
|
|
|
(defvar clunits_tree_minimum_leafs 0)
|
|
(define (acost:collect_trees unittypes params)
|
|
"Collect the trees into one file as an assoc list"
|
|
(let ((fd (fopen
|
|
(string-append
|
|
(get_param 'db_dir params "./")
|
|
(get_param 'trees_dir params "festival/trees/")
|
|
(get_param 'index_name params "all.")
|
|
(get_param 'trees_ext params ".tree"))
|
|
"wb"))
|
|
(tree_pref
|
|
(string-append
|
|
(get_param 'db_dir params "./")
|
|
(get_param 'trees_dir params "festival/trees/")))
|
|
(cluster_prune_limit (get_param 'cluster_prune_limit params 0))
|
|
(cluster_merge (get_param 'cluster_merge params 0)))
|
|
(format fd ";; Autogenerated list of selection trees\n")
|
|
(mapcar
|
|
(lambda (fp)
|
|
(format fd ";; %l %l\n" (car fp) (car (cdr fp))))
|
|
params)
|
|
(format fd "(set! clunits_selection_trees '(\n")
|
|
(mapcar
|
|
(lambda (unit)
|
|
(set! tree (car (load (string-append tree_pref unit ".tree") t)))
|
|
(if (> cluster_prune_limit 0)
|
|
(set! tree (cluster_tree_prune tree cluster_prune_limit)))
|
|
(if (> cluster_merge 0)
|
|
(set! tree (tree_merge_leafs tree cluster_merge)))
|
|
(if (boundp 'temp_tree_convert)
|
|
(set! tree (temp_tree_convert)))
|
|
(if (> (tree_num_units tree) clunits_tree_minimum_leafs)
|
|
(pprintf (list unit tree) fd)))
|
|
unittypes)
|
|
(format fd "))\n")
|
|
(fclose fd)))
|
|
|
|
(define (cluster_tree_prune_in_line prune_limit)
|
|
"(cluster_tree_prune_in_line)
|
|
Prune number of units in each cluster in each tree *by* prune_limit,
|
|
if negative, or *to* prune_limit, if positive."
|
|
(set! sucs_select_trees
|
|
(mapcar
|
|
(lambda (t)
|
|
(cluster_tree_prune t prune_limit))
|
|
sucs_select_trees)))
|
|
|
|
(define (tree_merge_leafs tree depth)
|
|
"(tree_merge_leafs tree depth)
|
|
Merge the leafs of the tree at goven depth. This allows the trees
|
|
to be pruned then the single leafs joined together into larger
|
|
clusters (so the viterbi part has something to do)."
|
|
(let ((num_leafs (tree_num_leafs tree)))
|
|
(cond
|
|
((< num_leafs 2) tree) ;; already at the foot
|
|
((< num_leafs depth)
|
|
(tree_collect_leafs tree))
|
|
(t
|
|
(list
|
|
(car tree)
|
|
(tree_merge_leafs (car (cdr tree)) depth)
|
|
(tree_merge_leafs (car (cdr (cdr tree))) depth))))))
|
|
|
|
(define (tree_num_leafs tree)
|
|
"(tree_num_leafs tree)
|
|
Number of leafs of given tree."
|
|
(cond
|
|
((cdr tree)
|
|
(+
|
|
(tree_num_leafs (car (cdr tree)))
|
|
(tree_num_leafs (car (cdr (cdr tree))))))
|
|
(t
|
|
1)))
|
|
|
|
(define (tree_num_units tree)
|
|
"(tree_num_units tree)
|
|
Number of leafs of given tree."
|
|
(cond
|
|
((cdr tree)
|
|
(+
|
|
(tree_num_units (car (cdr tree)))
|
|
(tree_num_units (car (cdr (cdr tree))))))
|
|
(t
|
|
(length (caar tree))
|
|
)))
|
|
|
|
(define (tree_collect_leafs tree)
|
|
"(tree_collect_leafs tree)
|
|
Combine all units in the leafs."
|
|
(cond
|
|
((cdr tree)
|
|
(let ((a (tree_collect_leafs (car (cdr tree))))
|
|
(b (tree_collect_leafs (car (cdr (cdr tree))))))
|
|
(list
|
|
(list
|
|
(append
|
|
(caar a)
|
|
(caar b))
|
|
10.0))))
|
|
(t
|
|
tree)))
|
|
|
|
(define (cluster_tree_prune tree prune_limit)
|
|
"(cluster_tree_prune TREE PRUNE_LIMIT)
|
|
Reduce the number of elements in the (CART) tree leaves to PRUNE_LIMIT
|
|
removing the ones further from the cluster centre. Maybe later this should
|
|
have guards on minimum number of units that must remain in the tree and
|
|
a per unit type limit."
|
|
(cond
|
|
((cdr tree) ;; a question
|
|
(list
|
|
(car tree)
|
|
(cluster_tree_prune (car (cdr tree)) prune_limit)
|
|
(cluster_tree_prune (car (cdr (cdr tree))) prune_limit)))
|
|
(t ;; tree leave
|
|
(list
|
|
(list
|
|
(remove_n_worst
|
|
(car (car tree))
|
|
(if (< prune_limit 0)
|
|
(* -1 prune_limit)
|
|
(- (length (car (car tree))) prune_limit)))
|
|
(car (cdr (car tree))))))))
|
|
|
|
(define (remove_n_worst lll togo)
|
|
"(remove_n_worst lll togo)
|
|
Remove togo worst items from lll."
|
|
(cond
|
|
((< togo 0)
|
|
lll)
|
|
((equal? 0 togo)
|
|
lll)
|
|
(t
|
|
(remove_n_worst
|
|
(remove (worst_unit (cdr lll) (car lll)) lll)
|
|
(- togo 1)))))
|
|
|
|
(define (worst_unit lll worst_so_far)
|
|
"(worst_unit lll worst_so_far)
|
|
Returns unit with worst score in list."
|
|
(cond
|
|
((null lll)
|
|
worst_so_far)
|
|
((< (car (cdr worst_so_far)) (car (cdr (car lll))))
|
|
(worst_unit (cdr lll) (car lll)))
|
|
(t
|
|
(worst_unit (cdr lll) worst_so_far))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Save the unit catalogue for use in the run-time index
|
|
|
|
(define (acost:save_catalogue utterances clunits_params)
|
|
"(acost:save_catalogue utterances clunits_params)
|
|
Save the catalogue with named units with times."
|
|
(let ((fd (fopen
|
|
(string-append
|
|
(get_param 'db_dir clunits_params "./")
|
|
(get_param 'catalogue_dir clunits_params "trees/")
|
|
(get_param 'index_name clunits_params "catalogue.")
|
|
".catalogue")
|
|
"wb"))
|
|
(num_units 0)
|
|
)
|
|
(format fd "EST_File index\n")
|
|
(format fd "DataType ascii\n")
|
|
(format fd "NumEntries %d\n"
|
|
(apply
|
|
+ (mapcar (lambda (u)
|
|
(length (utt.relation.items u 'Segment))) utterances)))
|
|
(format fd "IndexName %s\n" (get_param 'index_name clunits_params "cluser"))
|
|
(format fd "EST_Header_End\n")
|
|
(mapcar
|
|
(lambda (u)
|
|
(mapcar
|
|
(lambda (s)
|
|
(format fd "%s_%s %s %f %f %f\n"
|
|
(item.feat s "clunit_name")
|
|
(item.feat s 'occurid)
|
|
(utt.feat u 'fileid)
|
|
(item.feat s 'segment_start)
|
|
(item.feat s 'segment_mid)
|
|
(item.feat s 'segment_end)))
|
|
(utt.relation.items u 'Segment)))
|
|
utterances)
|
|
(fclose fd)))
|
|
|
|
(provide 'clunits_build.scm)
|