/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                      Author :  Alan W Black                           */
/*                      Date   :  April 1996                             */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* Some basic initialization functions for modules                       */
/*                                                                       */
/*=======================================================================*/
#include <stdio.h>
#include "festival.h"
#include "lexicon.h"
#include "modules.h"
#include "intonation.h"

// File-local helpers called by FT_Initialize_Utt, one per utterance input
// type.  Each one fills the freshly created stream(s) from the utterance's
// input form and is defined further down in this file.
static void create_words(EST_Utterance &u);     // used for type "Words"
static void create_segments(EST_Utterance &u);  // used for type "Segments"
static void create_wave(EST_Utterance &u);      // used for type "Wave"
static void create_phones(EST_Utterance &u);    // used for type "Phones"

LISP FT_Initialize_Utt(LISP utt)
{
    // Main utterance initialization routine.
    // Looks at the utterance's type, creates the stream(s) that type
    // needs and fills them from the input form.  Returns the utterance
    // it was given.
    EST_Utterance *utterance = GETUTTVAL(utt);
    EST_String kind;

    *cdebug << "Initialize module\n";

    kind = utt_type(*utterance);

    // Drop every stream except IForm before rebuilding
    utt_cleanup(*utterance);

    if (kind == "Words")
    {
	utterance->create_stream("Word");
	create_words(*utterance);
	return utt;
    }

    if (kind == "Text")
	return utt;		// no streams are built here for raw text

    if (kind == "Segments")
    {
	utterance->create_stream("Segment");
	utterance->create_stream("Target");
	create_segments(*utterance);
	return utt;
    }

    if (kind == "Phones")
    {
	utterance->create_stream("Segment");
	create_phones(*utterance);
	return utt;
    }

    if (kind == "Phrase")
    {
	create_phraseinput(*utterance);
	return utt;
    }

    if (kind == "Wave")
    {
	utterance->create_stream("Wave");
	create_wave(*utterance);
	return utt;
    }

    // None of the known types matched: report it and signal an error
    cerr << "Unknown utterance type \"" << kind << "\" for initialization "
	<< endl;
    festival_error();

    return utt;
}

void create_words(EST_Utterance &u)
{
    // Fill the word stream from the input form.  Each element of the
    // input form is either an atom naming the word, or a list whose
    // first element is the name and whose second element is a feature
    // list to attach to the new word.
    LISP remaining = utt_iform(u);

    while (remaining != NIL)
    {
	LISP entry = car(remaining);

	if (consp(entry))
	{
	    // (NAME FEATURES): add the word, then decorate it
	    EST_Stream_Item *item = add_word(u,get_c_string(car(entry)));
	    add_item_features(*item,car(cdr(entry)));
	}
	else
	{
	    // plain NAME
	    add_word(u,get_c_string(entry));
	}

	remaining = cdr(remaining);
    }
}

static void gc_wave(void *w) { delete (EST_Wave *)w; }
void create_wave(EST_Utterance &u)
{
    // Load the waveform whose file name is given by the input form and
    // append it, wrapped in a stream item, to the "Wave" stream.
    //
    // On a load failure an error message is printed and festival_error()
    // is called; nothing is added to the stream in that case.
    LISP lwave;
    EST_Wave *wave = new EST_Wave;

    lwave = utt_iform(u);

    if (wave->load(get_c_string(lwave)) != format_ok)
    {
	cerr << "Cannot load wavefile: " << get_c_string(lwave) << endl;
	// Release the wave before signalling the error: nothing owns it
	// yet, so it would otherwise leak.
	// NOTE(review): festival_error() is assumed not to return normally
	// (non-local exit) -- confirm.  The return below keeps us safe if
	// it ever does.
	delete wave;
	festival_error();
	return;
    }

    // Only build the item once the load has succeeded, so no item object
    // is live across the error exit above.
    EST_Stream_Item item;
    item.init("Wave");
    // gc_wave is registered with the wave, presumably so the item can
    // free it when the contents are discarded.
    item.set_contents(wave,gc_wave);

    u.stream("Wave").append(item);

}

void create_segments(EST_Utterance &u)
{
    // Build segments and their targets from the input form.
    //
    // Every element of the input form is read as
    //     (NAME DURATION (OFFSET VALUE) (OFFSET VALUE) ...)
    // Segments are laid end to end: a segment's end time is the running
    // sum of the durations so far.  Each target is positioned at the
    // segment's start time plus OFFSET and is linked to that segment.
    float seg_end = 0.0;

    for (LISP rest = utt_iform(u); rest != NIL; rest = cdr(rest))
    {
	LISP entry = car(rest);
	EST_String name = get_c_string(car(entry));
	float duration = get_c_float(car(cdr(entry)));
	LISP targets = cdr(cdr(entry));

	EST_Stream_Item *segment = add_segment(u,name);
	float seg_start = seg_end;	// this segment begins where the last ended
	seg_end += duration;
	segment->set_end(seg_end);

	for (LISP tl = targets; tl != NIL; tl = cdr(tl))
	{
	    LISP point = car(tl);
	    float position = seg_start + (get_c_float(car(point)));
	    float value = get_c_float(car(cdr(point)));
	    EST_Stream_Item *target = add_target(u,position,value);
	    link(*target,*segment);
	}
    }
}

static void create_phones(EST_Utterance &u)
{
    // Fill the segment stream from the input form, which is read as a
    // flat list of phone names; one segment is added per name.  No
    // timing information is set here.
    LISP remaining = utt_iform(u);

    while (remaining != NIL)
    {
	EST_String phone = get_c_string(car(remaining));
	add_segment(u,phone);
	remaining = cdr(remaining);
    }
}

static LISP utt_segf0_load(LISP labfile, LISP f0file)
{
    // Load segments from a label file and F0 targets from an F0 file
    // into a new utterance, which can then be used for resynthesis
    // from natural targets.
    //
    //   labfile  name of the label file loaded into the "Segment" stream
    //   f0file   name of the F0 track file
    //
    // Returns the new utterance (of type SegF0).  Calls festival_error()
    // if either file fails to load.
    EST_Track f0;
    int i;
    EST_Stream_Item *seg,*last,*targ;
    LISP lutt;

    // Build and evaluate the form (Utterance SegF0 nil) to obtain a
    // fresh utterance object.
    lutt =  cons(rintern("Utterance"),
		 cons(rintern("SegF0"),
		      cons(NIL,NIL)));

    lutt = leval(lutt,NIL);
    EST_Utterance *u = GETUTTVAL(lutt);

    u->create_stream("Segment");

    // Treat anything other than format_ok as a failure (as create_wave
    // does), so that read errors other than a bad format are not
    // silently ignored.
    if (u->stream("Segment").load(get_c_string(labfile)) != format_ok)
    {
	cerr << "utt.load.segf0: cannot load label file: "
	    << get_c_string(labfile) << endl;
	festival_error();
    }

    if (f0.load(get_c_string(f0file)) != format_ok)
    {
	cerr << "utt.load.segf0: cannot load F0 file: "
	    << get_c_string(f0file) << endl;
	festival_error();
    }

    // Now need to create targets for points in the EST_Track and relate
    // them to the appropriate segment
    u->create_stream("Target");

    seg = u->stream("Segment").head();
    last = u->stream("Segment").tail();

    // With no segments there is nothing to link targets to (and head()
    // and tail() are presumably null), so leave the Target stream empty.
    if ((seg == 0) || (last == 0))
	return lutt;

    for (i=0; i < f0.num_frames(); i++)
    {
	// Only frames inside the labelled region with an F0 value above
	// 5.0 become targets; the rest are skipped.
	if ((f0.t(i) <= last->end()) &&
	    (f0.a(i) > 5.0))
	{
	    targ = add_target(*u,f0.t(i),f0.a(i));
	    // Move forward to the segment containing this frame.  Several
	    // segments may have gone by since the last accepted frame
	    // (skipped frames do not advance the cursor), so keep stepping
	    // rather than advancing by one only.
	    while ((f0.t(i) > seg->end()) &&
		   (seg != last))
		seg=next(seg);
	    link(*targ, *seg);
	}
    }

    return lutt;
}

// Entry points of the utterance modules registered by festival_base_init()
// below.  FT_Initialize_Utt is defined earlier in this file; the others
// are not defined in the part of the file visible here (presumably they
// live in other source files -- confirm).
LISP FT_Initialize_Utt(LISP args);
LISP FT_Phrasify_Utt(LISP args);
LISP FT_Word_Utt(LISP args);
LISP FT_POS_Utt(LISP args);
LISP FT_PostLex_Utt(LISP utt);

void festival_base_init(void)
{
    // Registers the basic utterance modules, the utt.load.segf0 function
    // and the Word "break" feature function, each with its documentation
    // string.  (These registrations have not been given a home anywhere
    // else yet.)

    // Basic EST_Utterance modules
    festival_def_utt_module("Initialize",FT_Initialize_Utt,
	"(Initialize UTT)\n"
	"  This module should be called first on all utterances it does some\n"
	"  necessary initialization of the utterance and loads the base\n"
	"  streams with the information from the input form.");

    festival_def_utt_module("Phrasify",FT_Phrasify_Utt,
	"(Phrasify UTT)\n"
	"  Creates phrases from words, if pos_supported is non-nil, a more elaborate\n"
	"  system of prediction is used.  Here probability models based on part of\n"
	"  speech and B/NB distribution are used to predict breaks.  This system\n"
	"  uses standard Viterbi decoding techniques. If pos_supported is nil,\n"
	"  a simple CART-based prediction model is used. [see Phrase breaks]");

    festival_def_utt_module("Word",FT_Word_Utt,
	"(Word UTT)\n"
	"  Build the syllable and segment streams from the given words using the\n"
	"  Lexicon.  Uses part of speech information in the lexicon look up if\n"
	"  present.");

    festival_def_utt_module("POS",FT_POS_Utt,
	"(POS UTT)\n"
	"  Predict part of speech tags for the existing word stream.  If the variable\n"
	"  pos_lex_name is nil nothing happens, otherwise it is assumed to point to\n"
	"  a lexicon file giving part of speech distribution for words. An ngram\n"
	"  model file should be in pos_ngram_name.  The system uses standard\n"
	"  Viterbi decoding techniques. [see POS tagging]");

    festival_def_utt_module("Builtin_PostLex",FT_PostLex_Utt,
	"(Builtin_PostLex UTT)\n"
	"  Post-lexical rules.  Currently only vowel reduction applied to each\n"
	"  syllable using postlex_vowel_reduce_cart_tree, and the table of \n"
	"  vowel reduction pairs in postlex_vowel_reduce_table.");

    // Two-argument Lisp function for loading natural segments and F0.
    // NOTE(review): the last sentence of this doc string reads as if a
    // verb is missing ("designed specifically to ... segments"); the text
    // is left exactly as it was.
    init_subr_2("utt.load.segf0", utt_segf0_load,
	"(utt.load.segf0 LABFILE F0FILE)\n"
	"  Create a new utterance from a label file and F0 file.  These will\n"
	"  usually be X-label format, but any formats supported by the Speech\n"
	"  Tool Library will work.  This function is designed specifically to\n"
	"  segments and target from natural utterances for resynthesis.");

    // Feature function "break" on Word items
    festival_def_ff("break","Word",ff_word_break,
	"Word.break\n"
	"  Returns break value for position immediately after word.  Is 1 if\n"
	"  intraphrasal or the phrase name if the word is at the end of the phrase.");

}
