BibTeX entries in bib/cslu.bib

@MISC{cslu:www,
  key           = {{\relax CSLU}},
  title         = {{CSLU Speech Synthesis Research Group, Oregon Graduate Institute of Science and Technology}},
  howpublished  = {WWW page},
  year          = {1999},
  url           = {http://cslu.cse.ogi.edu/tts},
  pub-url       = {http://cslu.cse.ogi.edu/tts/publications},
  note          = {\url{http://cslu.cse.ogi.edu/tts}},
}
@ARTICLE{cslu:ieeetsap98,
  author        = {F. Kossentini and M. Macon and M. Smith},
  title         = {Audio coding using variable-depth multistage quantization},
  journal       = {IEEE Transactions on Speech and Audio Processing},
  volume        = {6},
  year          = {1998},
  note          = {\bibcsluurl},
}
@INPROCEEDINGS{cslu:esca98mm,
  author        = {M. W. Macon and A. E. Cronk and J. Wouters},
  title         = {Generalization and discrimination in tree-structured unit selection},
  booktitle     = {Proceedings of the 3rd ESCA/COCOSDA International Speech Synthesis Workshop},
  month         = {November},
  year          = {1998},
  note          = {\bibcsluurl},
  remarks       = {Great overview of several unit selection methods, comprehensive bibliography: origin of unit selection? \cite{sagisaka88}. Festival unit selection \cite{cstr:unitsel96,cstr:eursp95}. Classification and regression trees \cite{cart84}. Clustering and decision trees \cite{cstr:unitsel97,wang93,nakajima94}. Mahalanobis distance \cite{donovan96}. Decision trees for: speech recognition \cite{nock97}, speech synthesis \cite{huang96}. Data-driven direct mapping with ANN \cite{karaali96,tuerk93}. Distance measures for: coding \cite{quackenbush88}, ASR \cite{nocerino85, asp:icassp88}, in general \cite{ghitza97}, concatenative speech synthesis \cite{hansen98,cslu:icslp98-paper}. PLP: \cite{asp:itsa94}. Linear regression and correlation, Fisher transform: \cite{edwards93}. Tree pruning: \cite{cslu:icslp98cronk}. Masking effects: \cite{moore89}.},
  abstract      = {Concatenative ``selection-based'' synthesis from large databases has emerged as a viable framework for TTS waveform generation. Unit selection algorithms attempt to predict the appropriateness of a particular database speech segment using only linguistic features output by text analysis and prosody prediction components of a synthesizer. All of these algorithms have in common a training or ``learning'' phase in which parameters are trained to select appropriate waveform segments for a given feature vector input. One approach to this step is to partition available data into clusters that can be indexed by linguistic features available at runtime. This method relies critically on two important principles: discrimination of fine phonetic details using a perceptually-motivated distance measure in training and generalization to unseen cases in selection. In this paper, we describe efforts to systematically investigate and improve these parts of the process.},
}
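Annotation: the impurity criterion behind the clustering step described above can be sketched in a few lines. This is a hypothetical Python illustration of scoring a candidate cluster split by the decrease in within-cluster variance of acoustic feature vectors; none of the names come from the paper.

  import numpy as np

  def impurity(features):
      # Total within-cluster variance: sum of squared deviations from the mean.
      return np.sum((features - features.mean(axis=0)) ** 2)

  def split_gain(parent, left, right):
      # A good split maximizes the drop in impurity from parent to children.
      return impurity(parent) - (impurity(left) + impurity(right))

  # Usage: rows of `units` are acoustic feature vectors; `mask` answers a
  # yes/no question about the linguistic features of each unit.
  units = np.random.randn(100, 12)
  mask = np.random.rand(100) > 0.5
  gain = split_gain(units, units[mask], units[~mask])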
@INPROCEEDINGS{cslu:esca98kain,
  author        = {A. Kain and M. W. Macon},
  title         = {Personalizing a speech synthesizer by voice adaptation},
  booktitle     = {Proceedings of the 3rd ESCA/COCOSDA International Speech Synthesis Workshop},
  month         = {November},
  year          = {1998},
  pages         = {225--230},
  note          = {\bibcsluurl},
  abstract      = {A voice adaptation system enables users to quickly create new voices for a text-to-speech system, allowing for the personalization of the synthesis output. The system adapts to the pitch and spectrum of the target speaker, using a probabilistic, locally linear conversion function based on a Gaussian Mixture Model. Numerical and perceptual evaluations reveal insights into how adaptation quality depends on the amount of training data and the number of free parameters. A new joint density estimation algorithm is compared to a previous approach. Numerical errors are studied on the basis of broad phonetic categories. A data augmentation method for training data with incomplete phonetic coverage is investigated and found to maintain high speech quality while partially adapting to the target voice.},
}
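Annotation: the locally linear GMM-based conversion function referred to in this abstract and in cslu:icassp98kain below is usually written in the following form (standard GMM regression; the paper's exact parameterization may differ):

  F(x) = \sum_{i=1}^{m} p_i(x) \left[ \mu_i^y + \Sigma_i^{yx} (\Sigma_i^{xx})^{-1} (x - \mu_i^x) \right],

where x is a source spectral vector, the component parameters (\alpha_i, \mu_i, \Sigma_i) come from joint density estimation over paired source/target frames, and

  p_i(x) = \alpha_i \mathcal{N}(x; \mu_i^x, \Sigma_i^{xx}) / \sum_{j=1}^{m} \alpha_j \mathcal{N}(x; \mu_j^x, \Sigma_j^{xx})

is the posterior probability of mixture component i given x.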
@INPROCEEDINGS{cslu:icslp98cronk,
  author        = {Andrew E. Cronk and Michael W. Macon},
  title         = {{Optimized Stopping Criteria for Tree-Based Unit Selection in Concatenative Synthesis}},
  oldtitle      = {Optimization of stopping criteria for tree-structured unit selection},
  booktitle     = {Proceedings of the International Conference on Spoken Language Processing},
  volume        = {5},
  month         = {November},
  year          = {1998},
  pages         = {1951--1955},
  note          = {\bibcsluurl},
  remarks       = {Summary: Method for growing an optimal clustering tree (CART, as in \cite{cart84}). Not stopping with thresholds, but growing the tree completely (until no splittable clusters are left), and then pruning by recombining clusters with a greedy algorithm. Gives the evaluation measure \textit{V-fold cross validation} for tree quality. Clusters represent units with equivalent target cost. A best split of a cluster maximizes the decrease in data impurity (lower within-cluster variance of acoustic features). N.B.: Clustering of units is not classification, as the classes are not known in advance, and the method is unsupervised! Weighting in the distortion measure uses the Mahalanobis distance, with weights equal to the inverse of the variance. References: \cite{cstr:eursp95}, \cite{cstr:unitsel97}, \cite{cart84}, \cite{donovan96}, \cite{fukunaga90} (CART tree evaluation criterion), \cite{nock97}, \cite{nakajima94}, \cite{wang93}.},
}
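Annotation: the inverse-variance weighting mentioned in the remarks amounts to a diagonal Mahalanobis distance. A hypothetical Python sketch (names and data are made up, not from the paper):

  import numpy as np

  def diag_mahalanobis(a, b, inv_var):
      # Squared distance with per-dimension weights equal to 1/variance.
      d = a - b
      return float(np.sum(inv_var * d * d))

  data = np.random.randn(500, 12)    # acoustic feature vectors
  inv_var = 1.0 / data.var(axis=0)   # weights from the data itself
  dist = diag_mahalanobis(data[0], data[1], inv_var)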
@INPROCEEDINGS{cslu:icslp98kain,
  author        = {A. Kain and M. W. Macon},
  title         = {Text-to-speech voice adaptation from sparse training data},
  booktitle     = {Proceedings of the International Conference on Spoken Language Processing},
  month         = {November},
  year          = {1998},
  pages         = {2847--2850},
  note          = {\bibcsluurl},
}
@INPROCEEDINGS{cslu:icslp98-paper,
  author        = {J. Wouters and M. W. Macon},
  title         = {A Perceptual Evaluation of Distance Measures for Concatenative Speech Synthesis},
  booktitle     = {Proceedings of the International Conference on Spoken Language Processing},
  month         = {November},
  year          = {1998},
  note          = {\bibcsluurl},
  abstract      = {In concatenative synthesis, new utterances are created by concatenating segments (units) of recorded speech. When the segments are extracted from a large speech corpus, a key issue is to select segments that will sound natural in a given phonetic context. Distance measures are often used for this task. However, little is known about the perceptual relevance of these measures. More insight into the relationship between computed distances and perceptual differences is needed to develop accurate unit selection algorithms, and to improve the quality of the resulting computer speech. In this paper, we develop a perceptual test to measure subtle phonetic differences between speech units. We use the perceptual data to evaluate several popular distance measures. The results show that distance measures that use frequency warping perform better than those that do not, and minimal extra advantage is gained by using weighted distances or delta features.},
}
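Annotation: to make the frequency-warping result concrete, a warped distance measure compares spectra on a perceptual (e.g., mel) axis rather than a linear one. A hypothetical Python sketch using the common 2595 log10(1 + f/700) mel formula, which may differ from the measures actually tested in the paper:

  import numpy as np

  def hz_to_mel(f):
      return 2595.0 * np.log10(1.0 + f / 700.0)

  def warped_distance(spec_a, spec_b, freqs_hz, n_bins=64):
      # Resample both magnitude spectra onto a uniform mel grid,
      # then take the Euclidean distance there.
      mel = hz_to_mel(freqs_hz)
      grid = np.linspace(mel[0], mel[-1], n_bins)
      return float(np.linalg.norm(np.interp(grid, mel, spec_a)
                                  - np.interp(grid, mel, spec_b)))

  freqs = np.linspace(0, 8000, 257)               # FFT bin frequencies
  spec1, spec2 = np.abs(np.random.randn(2, 257))  # stand-in spectra
  d_warped = warped_distance(spec1, spec2, freqs)
  d_linear = float(np.linalg.norm(spec1 - spec2))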
@INPROCEEDINGS{cslu:cslutoolkit,
  author        = {S. Sutton and R. Cole and J. de Villiers and J. Schalkwyk and P. Vermeulen and M. Macon and Y. Yan and E. Kaiser and B. Rundle and K. Shobaki and P. Hosom and A. Kain and J. Wouters and D. Massaro and M. Cohen},
  title         = {{Universal Speech Tools: the CSLU Toolkit}},
  booktitle     = {Proceedings of the International Conference on Spoken Language Processing},
  month         = {November},
  year          = {1998},
  note          = {\bibcsluurl},
}
@TECHREPORT{cslu:german98,
  author        = {M. W. Macon and A. Kain and A. E. Cronk and H. Meyer and K. Mueller and B. Saeuberlich and A. W. Black},
  title         = {Rapid Prototyping of a German TTS System},
  number        = {CSE-98-015},
  institution   = {Department of Computer Science, Oregon Graduate Institute of Science and Technology},
  address       = {Portland, OR},
  month         = {September},
  year          = {1998},
  note          = {\bibcsluurl},
}
@INPROCEEDINGS{cslu:icassp98mm,
  author        = {M. W. Macon and A. McCree and W. M. Lai and V. Viswanathan},
  title         = {Efficient Analysis/Synthesis of Percussion Musical Instrument Sounds Using an All-Pole Model},
  booktitle     = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'98)},
  volume        = {6},
  month         = {May},
  year          = {1998},
  pages         = {3589--3592},
  note          = {\bibcsluurl},
  abstract      = {It is well-known that an impulse-excited, all-pole filter is capable of representing many physical phenomena, including the oscillatory modes of percussion musical instruments like woodblocks, xylophones, or chimes. In contrast to the more common application of all-pole models to speech, however, practical problems arise in music synthesis due to the location of poles very close to the unit circle. The objective of this work was to develop algorithms to find excitation and filter parameters for synthesis of percussion instrument sounds using only an inexpensive all-pole filter chip (TI TSP50C1x). The paper describes analysis methods for dealing with pole locations near the unit circle, as well as a general method for modeling the transient attack characteristics of a particular sound while independently controlling the amplitudes of each oscillatory mode.},
}
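Annotation: the impulse-excited all-pole idea in this abstract is easy to demonstrate: a conjugate pole pair near the unit circle yields a slowly decaying oscillatory mode. A hypothetical Python sketch (mode frequency and pole radius are made-up values, not from the paper):

  import numpy as np
  from scipy.signal import lfilter

  fs = 16000
  f0, r = 440.0, 0.9995             # mode frequency (Hz), pole radius < 1
  theta = 2 * np.pi * f0 / fs
  # Conjugate pole pair -> denominator 1 - 2 r cos(theta) z^-1 + r^2 z^-2
  a = [1.0, -2 * r * np.cos(theta), r * r]
  x = np.zeros(fs)                  # one second of excitation
  x[0] = 1.0                        # a single impulse
  y = lfilter([1.0], a, x)          # decaying sinusoid near 440 Hz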
@INPROCEEDINGS{cslu:icassp98kain,
  author        = {Alexander Kain and Michael W. Macon},
  title         = {Spectral Voice Conversion for Text-to-Speech Synthesis},
  year          = {1998},
  booktitle     = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'98)},
  pages         = {285--288},
  note          = {\bibcsluurl},
  abstract      = {A new voice conversion algorithm that modifies a source speaker's speech to sound as if produced by a target speaker is presented. It is applied to a residual-excited LPC text-to-speech diphone synthesizer. Spectral parameters are mapped using a locally linear transformation based on Gaussian mixture models whose parameters are trained by joint density estimation. The LPC residuals are adjusted to match the target speaker's average pitch. To study effects of the amount of training on performance, data sets of varying sizes are created by automatically selecting subsets of all available diphones by a vector quantization method. In an objective evaluation, the proposed method is found to perform more reliably for small training sets than a previous approach. In perceptual tests, it was shown that nearly optimal spectral conversion performance was achieved, even with a small amount of training data. However, speech quality improved with an increase in training set size.},
}
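Annotation: a minimal numerical sketch of the mapping step (the formula quoted after cslu:esca98kain above), assuming the joint-density GMM parameters have already been estimated; all names are hypothetical:

  import numpy as np
  from scipy.stats import multivariate_normal

  def convert(x, alphas, mu_x, mu_y, S_xx, S_yx):
      # Posterior probability of each mixture component given x ...
      p = np.array([a * multivariate_normal.pdf(x, m, S)
                    for a, m, S in zip(alphas, mu_x, S_xx)])
      p /= p.sum()
      # ... then a posterior-weighted sum of per-component linear maps.
      y = np.zeros(len(mu_y[0]))
      for i in range(len(alphas)):
          y += p[i] * (mu_y[i] + S_yx[i] @ np.linalg.solve(S_xx[i], x - mu_x[i]))
      return y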
@TECHREPORT{cslu:ogireslpc97,
  author        = {M. W. Macon and A. E. Cronk and J. Wouters and A. Kain},
  title         = {OGIresLPC: Diphone synthesizer using residual-excited linear prediction},
  number        = {CSE-97-007},
  institution   = {Department of Computer Science, Oregon Graduate Institute of Science and Technology},
  address       = {Portland, OR},
  month         = {September},
  year          = {1997},
  note          = {\bibcsluurl},
}
@INPROCEEDINGS{cslu:aes97,
  author        = {M. W. Macon and L. Jensen-Link and J. Oliverio and M. Clements and E. B. George},
  title         = {Concatenation-based MIDI-to-singing voice synthesis},
  booktitle     = {103rd Meeting of the Audio Engineering Society},
  address       = {New York},
  year          = {1997},
  note          = {\bibcsluurl},
  abstract      = {In this paper, we propose a system for synthesizing the human singing voice and the musical subtleties that accompany it. The system, Lyricos, employs a concatenation-based text-to-speech method to synthesize arbitrary lyrics in a given language. Using information contained in a regular MIDI file, the system chooses units, represented as sinusoidal waveform model parameters, from an inventory of data collected from a professional singer, and concatenates these to form arbitrary lyrical phrases. Standard MIDI messages control parameters for the addition of vibrato, spectral tilt, and dynamic musical expression, resulting in a very natural-sounding singing voice.},
}
@ARTICLE{cslu:trsap97,
  author        = {M. W. Macon and M. A. Clements},
  title         = {Sinusoidal modeling and modification of unvoiced speech},
  journal       = {IEEE Transactions on Speech and Audio Processing},
  volume        = {5},
  month         = {November},
  year          = {1997},
  pages         = {557--560},
  number        = {6},
  note          = {\bibcsluurl},
  abstract      = {Although sinusoidal models have been shown to be useful for time-scale and pitch modification of voiced speech, objectionable artifacts often arise when such models are applied to unvoiced speech. This correspondence presents a sinusoidal model-based speech modification algorithm that preserves the natural character of unvoiced speech sounds after pitch and time-scale modification, eliminating commonly-encountered artifacts. This advance is accomplished via a perceptually-motivated modulation of the sinusoidal component phases that mitigates artifacts in the reconstructed signal after time-scale and pitch modification.},
}
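Annotation: the abstract does not spell out the phase modulation; a common device with the same intent is to perturb the phases of sinusoidal components in unvoiced frames so that modification does not introduce tonal artifacts. A toy Python sketch under that assumption (an assumed stand-in, not the paper's actual algorithm):

  import numpy as np

  rng = np.random.default_rng(0)

  def synth_frame(amps, freqs, phases, n, fs, voiced):
      # Dither phases of unvoiced frames to avoid tonal 'buzziness'
      # after time-scale or pitch modification.
      t = np.arange(n) / fs
      if not voiced:
          phases = phases + rng.uniform(-np.pi, np.pi, len(phases))
      return sum(a * np.cos(2 * np.pi * f * t + p)
                 for a, f, p in zip(amps, freqs, phases))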
@INPROCEEDINGS{cslu:icassp97,
  author        = {Michael Macon and Leslie Jensen-Link and James Oliverio and Mark A. Clements and E. Bryan George},
  title         = {A Singing Voice Synthesis System Based on Sinusoidal Modeling},
  year          = {1997},
  booktitle     = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'97)},
  pages         = {435--438},
  note          = {\bibcsluurl},
  abstract      = {Although sinusoidal models have been demonstrated to be capable of high-quality musical instrument synthesis, speech modification, and speech synthesis, little exploration of the application of these models to the synthesis of singing voice has been undertaken. In this paper, we propose a system framework similar to that employed in concatenation-based text-to-speech synthesizers, and describe its extension to the synthesis of singing voice. The power and flexibility of the sinusoidal model used in the waveform synthesis portion of the system enables high-quality, computationally-efficient synthesis and the incorporation of musical qualities such as vibrato and spectral tilt variation. Modeling of segmental phonetic characteristics is achieved by employing a ``unit selection'' procedure that selects sinusoidally-modeled segments from an inventory of singing voice data collected from a human vocalist. The system, called Lyricos, is capable of synthesizing very natural-sounding singing that maintains the characteristics and perceived identity of the analyzed vocalist.},
}
@INPROCEEDINGS{cslu:icassp96,
  address       = {Atlanta, USA},
  author        = {Michael W. Macon and Mark A. Clements},
  booktitle     = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'96)},
  title         = {{Speech Concatenation and Synthesis Using an Overlap--Add Sinusoidal Model}},
  year          = {1996},
  volume        = {1},
  pages         = {361--364},
  note          = {\bibcsluurl},
  abstract      = {In this paper, an algorithm for the concatenation of speech signal segments taken from disjoint utterances is presented. The algorithm is based on the Analysis-by-Synthesis/Overlap-Add (ABS/OLA) sinusoidal model, which is capable of performing high quality pitch- and time-scale modification of both speech and music signals. With the incorporation of concatenation and smoothing techniques, the model is capable of smoothing the transitions between separately-analyzed speech segments by matching the time- and frequency-domain characteristics of the signals at their boundaries. The application of these techniques in a text-to-speech system based on concatenation of diphone sinusoidal models is also presented.},
}
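Annotation: for reference, overlap-add sinusoidal synthesis has the general form (simplified; see the paper for the exact ABS/OLA formulation):

  s[n] = \sum_k w[n - kN] \sum_l A_l^{(k)} \cos\big( \omega_l^{(k)} (n - kN) + \phi_l^{(k)} \big),

where each frame k contributes a short sum of sinusoids and w is a tapered synthesis window hopped by N samples; concatenation smoothing then matches the amplitudes, frequencies, and phases of the sinusoids across segment boundaries.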
@ARTICLE{cslu:jasa95,
  author        = {M. W. Macon and M. A. Clements},
  title         = {Speech synthesis based on an overlap-add sinusoidal model},
  journal       = {Journal of the Acoustical Society of America},
  volume        = {97},
  number        = {5, Pt. 2},
  month         = {May},
  year          = {1995},
  pages         = {3246},
  note          = {\bibcsluurl},
}
