BibTeX entries in bib/att.bib
@MISC{att:www,
  key          = {ATT},
  title        = {{AT\&T} Labs, {Oregon Graduate Institute of Science and Technology}},
  howpublished = {WWW page},
  year         = {1999},
  url          = {http://www.research.att.com/projects/tts/},
  note         = {\url{http://www.research.att.com/projects/tts/}},
}
@INPROCEEDINGS{att:nextgen99,
  author    = {Beutnagel, M. and Conkie, A. and Schroeter, J. and Stylianou, Y. and Syrdal, A.},
  title     = {The {AT\&T} {Next-Gen} {TTS} System},
  booktitle = {Joint Meeting of {ASA}, {EAA}, and {DAGA}},
  address   = {Berlin, Germany},
  month     = mar,
  year      = {1999},
  note      = {\bibatturl},
  abstract  = {The new AT\&T Text-To-Speech (TTS) system for general U.S. English text is based on best-choice components of the AT\&T Flextalk TTS, the Festival System from the University of Edinburgh, and ATR's CHATR system. From Flextalk, it employs text normalization, letter-to-sound, and prosody generation. Festival provides a flexible and modular architecture for easy experimentation and competitive evaluation of different algorithms or modules. In addition, we adopted CHATR's unit selection algorithms and modified them in an attempt to guarantee high intelligibility under all circumstances. Finally, we have added our own Harmonic plus Noise Model (HNM) backend for synthesizing the output speech. Most decisions made during the research and development phase of this system were based on formal subjective evaluations. We feel that the new system goes a long way toward delivering on the long-standing promise of truly natural-sounding, as well as highly intelligible, synthesis.},
}
@INPROCEEDINGS{att:diph-select98,
  author    = {Beutnagel, Mark and Conkie, Alistair and Syrdal, Ann K.},
  title     = {Diphone Synthesis using Unit Selection},
  booktitle = {The 3rd {ESCA/COCOSDA} Workshop on Speech Synthesis},
  address   = {Jenolan Caves, Australia},
  month     = nov,
  year      = {1998},
  note      = {\bibatturl},
  remarks   = {Summary: CHATR unit selection (using phone units) extended to diphones. Open synthesis backend: PSOLA, HNM, wave concat. Uses standard Festival. Careful listening test examining influence on quality of synthesis/unit type/pruning. Base for Next-Gen TTS \cite{att:nextgen99}?},
  abstract  = {This paper describes an experimental AT\&T concatenative synthesis system using unit selection, for which the basic synthesis units are diphones. The synthesizer may use any of the data from a large database of utterances. Since there are in general multiple instances of each concatenative unit, the system performs dynamic unit selection. Selection among candidates is done dynamically at synthesis, in a manner that is based on and extends unit selection implemented in the CHATR synthesis system [1][4]. Selected units may be either phones or diphones, and they can be synthesized by a variety of methods, including PSOLA [5], HNM [11], and simple unit concatenation. The AT\&T system, with CHATR unit selection, was implemented within the framework of the Festival Speech Synthesis System [2]. The voice database amounted to approximately one and one-half hours of speech and was constructed from read text taken from three sources. The first source was a portion of the 1989 Wall Street Journal material from the Penn Treebank Project, so that the most frequent diphones were well represented. Complete diphone converage was assured by the second text, which was designed for diphone databases [12]. A third set of data consisted of recorded prompts for telephone service applications. Subjective formal listening tests were conducted to compare speech quality for several options that exist in the AT\&T synthesizer, including synthesis methods and choices of fundamental units. These tests showed that unit selection techniques can be successfully applied to diphone synthesis.},
}
@INPROCEEDINGS{att:HNM98,
  author    = {Stylianou, Yannis},
  title     = {Concatenative Speech Synthesis using a {Harmonic plus Noise Model}},
  booktitle = {The 3rd {ESCA/COCOSDA} Workshop on Speech Synthesis},
  address   = {Jenolan Caves, Australia},
  month     = nov,
  year      = {1998},
  note      = {\bibatturl},
  abstract  = {This paper describes the application of the Harmonic plus Noise Model, HNM, for concatenative Text-to-Speech (TTS) synthesis. In the context of HNM, speech signals are represented as a time-varying harmonic component plus a modulated noise component. The decomposition of speech signal in these two components allows for more natural-sounding modifications (e.g., source and filter modifications) of the signal. The parametric representation of speech using HNM provides a straightforward way of smoothing discontinuities of acoustic units around concatenation points. Formal listening tests have shown that HNM provides high-quality speech synthesis while outperforming other models for synthesis (e.g., TD-PSOLA) in intelligibility, naturalness and pleasantness.},
}
@INPROCEEDINGS{att:ph98,
  author    = {Stylianou, Yannis},
  title     = {Removing Phase Mismatches in Concatenative Speech Synthesis},
  booktitle = {The 3rd {ESCA/COCOSDA} Workshop on Speech Synthesis},
  address   = {Jenolan Caves, Australia},
  month     = nov,
  year      = {1998},
  note      = {\bibatturl},
  abstract  = {Concatenation of acoustic units is widely used in most of the currently available text-to-speech systems. While this approach leads to higher intelligibility and naturalness than synthesis-by-rule, it has to cope with the issues of concatenating acoustic units that have been recorded in a different order. One important issue in concatenation is that of synchronization of speech frames or, in other words, inter-frame coherence. This paper presents a novel method for synchronization of signals with applications to speech synthesis. The method is based on the notion of center of gravity applied to speech signals. It is an off-line approach as this can be done during analysis with no computational burden on synthesis. The method has been tested with the Harmonic plus Noise Model, HNM, on many large speech databases. The resulting synthetic speech is free of phase mismatch (inter-frame incoherence) problems.},
}
@INPROCEEDINGS{att:Yang98,
  author    = {Yang, Ping-Fai and Stylianou, Yannis},
  title     = {Real Time Voice Alteration Based on Linear Prediction},
  booktitle = {Proc.~{ICSLP98}},
  year      = {1998},
  note      = {\bibatturl},
}
@INPROCEEDINGS{att:Syrdal98,
  author    = {Syrdal, Ann K. and Conkie, Alistair and Stylianou, Yannis},
  title     = {Exploration of Acoustic Correlates in Speaker Selection for Concatenative Synthesis},
  booktitle = {Proc.~{ICSLP98}},
  year      = {1998},
  note      = {\bibatturl},
}
@INPROCEEDINGS{att:Ostermann98,
  author    = {Ostermann, J{\"o}rn and Beutnagel, Mark C. and Fischer, Ariel and Wang, Yao},
  title     = {Integration of Talking Heads and Text-to-Speech Synthesizers for Visual {TTS}},
  booktitle = {Proc.~{ICSLP98}},
  year      = {1998},
  note      = {\bibatturl},
}
@INPROCEEDINGS{att:paperSYN98,
  author    = {Syrdal, Ann K. and Stylianou, Yannis G. and Garrison, Laurie F. and Conkie, Alistair and Schroeter, Juergen},
  title     = {{TD-PSOLA} versus {Harmonic Plus Noise Model} in Diphone Based Speech Synthesis},
  booktitle = {Proc.~{ICASSP98}},
  year      = {1998},
  pages     = {273--276},
  note      = {\bibatturl},
  abstract  = {In an effort to select a speech representation for our next generation concatenative text-to-speech synthesizer, the use of two candidates is investigated; TD-PSOLA and the Harmonic plus Noise Model, HNM. A formal listening test has been conducted and the two candidates have been rated regarding intelligibility, naturalness and pleasantness. Ability for database compression and computational load is also discussed. The results show that HNM consistently outperforms TD-PSOLA in all the above features except for computational load. HNM allows for high-quality speech synthesis without smoothing problems at the segmental boundaries and without buzziness or other oddities observed with TD-PSOLA.},
}
This document was translated from LaTeX by HEVEA.