speech-tools/main/pda_main.cc

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                    Author :  Paul Taylor                              */
/*                    Date   :  May 1994                                 */
/*-----------------------------------------------------------------------*/
/*             Pitch Detection Algorithm Main routine                    */
/*                                                                       */
/*=======================================================================*/
#include <fstream>
#include "EST.h"
#include "sigpr/EST_sigpr_utt.h"
#include "EST_cmd_line_options.h"

void set_parameters(EST_Features &a_list, EST_Option &al);

void option_override(EST_Features &op, EST_Option al,
		     const EST_String &option, const EST_String &arg);

static int save_pm(EST_String filename, EST_Track fz);

/** @name <command>pda</command> <emphasis>Pitch Detection Algorithm</emphasis>
    @id pda-manual
  * @toc
 */

//@{

/**@name Synopsis
  */
//@{

//@synopsis

/**
pda is a pitch detection algorithm that produces a fundamental frequency
contour from a speech waveform file. At present only the
super resolution pitch determination algorithm is implemented.
See (Medan, Yair, and Chazan, 1991) and (Bagshaw et al., 1993) for a detailed
description of the algorithm.
</para><para>

The default values given below were found to optimise the performance
of the pitch determination algorithm for speech data sampled at 20kHz
using a 16\-bit waveform and low pass filter with a 600Hz cut-off
frequency and more than \-85dB rejection above 700Hz. The best
performance is obtained when the [\-p] flag is passed.  </para><para>
*/

//@}

/**@name Options
  */
//@{

//@options

//@}


int main (int argc, char *argv[])
{
    EST_Track fz;
    EST_Wave sig;
    EST_Option al;
    EST_Features op;
    EST_String out_file("-");
    EST_StrList files;

    parse_command_line
	(argc, argv,
       EST_String("[input file] -o [output file] [options]\n")+
       "Summary: pitch track waveform files\n"
       "use \"-\" to make input and output files stdin/out\n"
       "-h               Options help\n\n"+
       options_wave_input()+
       options_pda_general()+
       options_pda_srpd()+
       options_track_output(),
			files, al);

    default_pda_options(op);
    set_parameters(op, al);

    if (read_wave(sig, files.first(), al) != format_ok)
	exit(-1);

    out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";

    pda(sig, fz, op);		// do f0 tracking

    if (al.present("-pm"))
	save_pm(out_file, fz);
    else
	fz.save(out_file, op.S("f0_file_type", "0"));

    if (al.present("-diff"))
    {
	fz = differentiate(fz);
	fz.save(out_file + ".diff", op.S("f0_file_type", "0"));
    }
    return 0;
}


/*
 * Transfer command line settings from `al` into the PDA feature set `op`.
 *
 * "srpd_resize" is always set to 1.  Each (feature, flag) pair in the
 * table below is then handed to option_override() in table order.
 * Finally "do_low_pass" is set to "true" when -L has the value "true",
 * and to "false" when -R has the value "true" (-R is tested last, so it
 * wins if both are given).
 *
 * NOTE(review): "-R" appears twice - as the flag feeding
 * "v2uv_coef_thresh_ratio" and as the switch that disables low pass
 * filtering.  Confirm against the option table which meaning is intended.
 */
void set_parameters(EST_Features &op, EST_Option &al)
{
    static const struct
    {
	const char *feature;	// name of the feature in op
	const char *flag;	// command line flag supplying its value
    } overrides[] =
    {
	// general options
	{ "pda_frame_shift",		"-shift"  },
	{ "pda_frame_length",		"-length" },
	{ "max_pitch",			"-fmax"   },
	{ "min_pitch",			"-fmin"   },
	// low pass filtering options
	{ "lpf_cutoff",			"-u"      },
	{ "lpf_order",			"-forder" },
	// srpd options
	{ "decimation",			"-d"      },
	{ "noise_floor",		"-n"      },
	{ "min_v2uv_coef_thresh",	"-m"      },
	{ "v2uv_coef_thresh_ratio",	"-R"      },
	{ "v2uv_coef_thresh",		"-H"      },
	{ "anti_doubling_thresh",	"-t"      },
	{ "peak_tracking",		"-P"      },
	// file types
	{ "f0_file_type",		"-otype"  },
	{ "wave_file_type",		"-itype"  }
    };
    const int n_overrides = sizeof(overrides) / sizeof(overrides[0]);

    op.set("srpd_resize", 1);

    for (int k = 0; k < n_overrides; ++k)
	option_override(op, al, overrides[k].feature, overrides[k].flag);

    if (al.val("-L", 0) == "true")
    {
	op.set("do_low_pass", "true");
    }
    if (al.val("-R", 0) == "true")
    {
	op.set("do_low_pass", "false");
    }
}

/*    a_list.override_val("sample_rate", al.val("-f", 0));
    a_list.override_val("min_pitch",  al.val("-fmin", 0));
    a_list.override_val("max_pitch",  al.val("-fmax", 0));
    a_list.override_val("pda_frame_shift", al.val("-s", 0));
    a_list.override_val("pda_frame_length",al.val("-l", 0));

    // low pass filtering options.
    a_list.override_val("lpf_cutoff",al.val("-u", 0));
    a_list.override_val("lpf_order",al.val("-forder", 0));

    //sprd options
    a_list.override_val("decimation", al.val("-d", 0));
    a_list.override_val("noise_floor",   al.val("-n", 0));
    a_list.override_val("min_v2uv_coef_thresh", al.val("-m", 0));
    a_list.override_val("v2uv_coef_thresh_ratio", al.val("-r", 0));
    a_list.override_val("v2uv_coef_thresh", al.val("-H", 0));
    a_list.override_val("anti_doubling_thresh", al.val("-t", 0));
    a_list.override_val("peak_tracking", al.val("-P", 0));
    if (al.val("-L", 0) == "true")
	a_list.override_val("do_low_pass", "true");
    if (al.val("-R", 0) == "true")
	a_list.override_val("do_low_pass", "false");
    a_list.override_val("f0_file_type", al.val("-otype", 0));
    a_list.override_val("wave_file_type", al.val("-itype", 0));
*/


/*
 * Write pitchmark positions derived from the F0 track `fz` to `filename`
 * ("-" means standard output), preceded by an xmg "XAO1" header.
 *
 * For every value frame with a positive F0 a period of 1/F0 is added to
 * a running position and the new position, scaled by 1000, is written on
 * its own line (presumably seconds to milliseconds - confirm the units
 * of the track).  After a gap the running position restarts from the
 * time of the next usable frame.
 *
 * Frames that are not values, and frames whose F0 is not strictly
 * positive, are treated as gaps: dividing by such an F0 would give an
 * infinite or negative period and corrupt every later position.
 *
 * Returns 0 on success, -1 if the output cannot be opened or a write
 * error is detected.
 */
static int save_pm(EST_String filename, EST_Track fz)
{
    // The file stream lives on the stack so it is closed and released on
    // every return path, including the error returns.
    ofstream file_out;
    ostream *outf = &cout;
    float position, period;

    if (!(filename == "-"))
    {
	file_out.open(filename);
	outf = &file_out;
    }

    if (!(*outf))
    {
	cerr << "save_pm: can't write to file \"" << filename << "\"" << endl;
	return -1;
    }

    *outf << "XAO1\n\n";	// xmg header identifier.
    *outf << "LineType        bars \n";
    *outf << "LineStyle       solid \n";
    *outf << "LineWidth       0 \n";
    *outf << "Freq 16\n";
    *outf << "Format  Binary \n";
    *outf << char(12) << "\n";	// control L character

    position = 0.0;
    int gap = 0;
    for (int i = 0; i < fz.num_frames(); ++i)
    {
	// Written as !(x > 0) so that a NaN F0 is rejected as well.
	if (!fz.val(i) || !(fz.a(i) > 0.0))
	{
	    gap = 1;
	    continue;
	}

	if (gap)
	{
	    // Resynchronise with the track after a gap.
	    position = fz.t(i);
	    gap = 0;
	}
	period = 1.0 / fz.a(i);
	*outf << (position + period) * 1000.0 << "\n";
	position += period;
    }

    // Flush once at the end and report any write failure (e.g. disk full).
    outf->flush();
    if (!(*outf))
    {
	cerr << "save_pm: error writing to file \"" << filename << "\"" << endl;
	return -1;
    }

    return 0;
}

/**@name Examples

Pitch detection on typical male voice, using low pass filtering:
<screen>
$ pda kdt_010.wav -o kdt_010.f0 -fmin 80 -fmax 200 -L
</screen>
*/
//@{

//@}
//@}