Research

2015-2017 Postdoctoral position, Machine Intelligence Lab, University of Cambridge, UK
High-Quality Voice Model for Statistical Parametric Speech Synthesis

  • Parametric speech synthesis [1][2][3][4]

2014-2015 Postdoctoral position, Ircam, CNRS-UPMC, Paris, France
ChaNTeR National project

  • Singing voice synthesis [5][6][7]

2011-2013 Postdoctoral position, University of Crete and FORTH, Heraklion, Greece
High-Resolution and Multi-Frame Speech Transformation

  • Adaptive Harmonic Model (aHM) [8, 9, 10, 11]
  • Voice source modeling: Phase Distortion Deviation (PDD), HMPD vocoder [12, 13, 14, 15]

2006-2010 Ph.D. studies, Ircam, Paris, France
Glottal source and vocal-tract filter separation [16]

  • Estimation of shape parameters of glottal models by phase minimization [17, 18, 19, 20, 21]
  • Voice transformation/synthesis, SVLN vocoder [22, 23, 24]
  • Glottal Closure Instant detection [25]
  • High-speed videoendoscopy [26]; see the two Ircam_USC databases available for download.

2002-2003 Undergraduate, UniNE, Neuchâtel, Switzerland

  • Estimation of the fundamental frequency of musical instruments in monophonic and polyphonic contexts (see FMIT)

[1] [pdf] G. Degottex and M. Gales, “A Spectrally Weighted Mixture of Least Square Error and Wasserstein Discriminator Loss for Generative SPSS,” in Proc. Workshop on Spoken Language Technology (SLT), Athens, Greece, 2018.
[Bibtex]
@inproceedings{DegottexG2018percivaltts,
author = {G. Degottex and M. Gales},
booktitle = {Proc. Workshop on Spoken Language Technology (SLT)},
title = {A Spectrally Weighted Mixture of Least Square Error and Wasserstein Discriminator Loss for Generative SPSS},
address = {Athens, Greece},
month = {December},
year = {2018},
abstract = {Generative networks can create an artificial spectrum based on their conditional distribution estimate instead of predicting only the mean value, as the Least Square (LS) solution does. This is promising since the LS predictor is known to oversmooth features, leading to muffling effects. However, modeling a whole distribution instead of a single mean value requires more data and thus also more computational resources. With only one hour of recording, as often used with LS approaches, the resulting spectrum is noisy and sounds full of artifacts. In this paper, we suggest a new loss function, mixing the LS error and the loss of a discriminator trained with Wasserstein GAN, while weighting this mixture differently through the frequency domain. Using listening tests, we show that, using this mixed loss, the generated spectrum is smooth enough to obtain a decent perceived quality. By making our source code available online, we also hope to make generative networks more accessible by lowering the necessary resources.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2018percivaltts.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2018percivaltts.pdf}
}
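To make the loss described above concrete, here is a minimal PyTorch sketch of a spectrally weighted mixture of a least-squares term and a Wasserstein critic term. The tensor shapes, the per-bin weight w, and the assumption that the critic returns per-bin scores are illustrative choices, not the paper's actual implementation (which is available in the released source code).

import torch

def mixed_loss(y_hat, y, critic, w):
    # y_hat, y: (batch, frames, bins) generated and target log-spectra
    # w: (bins,) weights in [0, 1]; high values favour the LS term in a band,
    # low values favour the adversarial term there (assumed convention)
    ls_term = torch.mean(w * (y_hat - y) ** 2)          # least-squares error
    wass_term = -torch.mean((1.0 - w) * critic(y_hat))  # critic score, assumed per-bin
    return ls_term + wass_term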
[2] [pdf] [doi] G. Degottex, P. Lanchantin, and M. Gales, “A Log Domain Pulse Model for Parametric Speech Synthesis,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, iss. 1, pp. 57-70, 2018.
[Bibtex]
@article{DegottexG2017pmlj,
author={G. Degottex and P. Lanchantin and M. Gales},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
title={A Log Domain Pulse Model for Parametric Speech Synthesis},
year={2018},
volume={26},
number={1},
pages={57-70},
abstract={Most of the degradation in current Statistical Parametric Speech Synthesis (SPSS) results from the form of the vocoder. One of the main causes of degradation is the reconstruction of the noise. In this article, a new signal model is proposed that leads to a simple synthesizer, without the need for ad-hoc tuning of model parameters. The model is not based on the traditional additive linear source-filter model; instead, it adopts a combination of speech components that are additive in the log domain. Also, the same representation is used for voiced and unvoiced segments, rather than relying on binary voicing decisions. This avoids the voicing error discontinuities that can occur in many current vocoders. A simple binary mask is used to denote the presence of noise in the time-frequency domain, which is less sensitive to classification errors. Four experiments have been carried out to evaluate this new model. The first experiment examines the noise reconstruction issue. Three listening tests have also been carried out that demonstrate the advantages of this model: comparison with the STRAIGHT vocoder; the direct prediction of the binary noise mask by using a mixed output configuration; and partial improvements of creakiness using a mask correction mechanism.},
doi={10.1109/TASLP.2017.2761546},
ISSN={2329-9290},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2017pmlj_acceptedversion.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2017pmlj_acceptedversion.pdf}
}
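As a rough illustration of two ideas in the abstract above, a uniform representation for voiced and unvoiced segments and a binary time-frequency noise mask, the sketch below builds a single synthetic pulse. The inputs and their handling are simplifications for illustration, not the published PML algorithm.

import numpy as np

rng = np.random.default_rng(0)

def synth_pulse(log_env, noise_mask, det_phase):
    # log_env: (bins,) log-amplitude envelope sampled on DFT bins
    # noise_mask: (bins,) boolean, True where a bin is treated as noise
    # det_phase: (bins,) deterministic phase used where the mask is False
    rand_phase = rng.uniform(-np.pi, np.pi, log_env.shape)
    phase = np.where(noise_mask, rand_phase, det_phase)  # same treatment for voiced/unvoiced
    spec = np.exp(log_env + 1j * phase)                  # amplitude taken from the log domain
    return np.fft.irfft(spec)                            # one time-domain pulse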
[3] [pdf] G. Degottex, P. Lanchantin, and M. Gales, “A Pulse Model in Log-domain for a Uniform Synthesizer,” in Proc. 9th Speech Synthesis Workshop (SSW9), Sunnyvale, CA, USA, 2016.
[Bibtex]
@inproceedings{DegottexG2016pml,
author = {G. Degottex and P. Lanchantin and M. Gales},
booktitle = {Proc. 9th Speech Synthesis Workshop (SSW9)},
title = {A Pulse Model in Log-domain for a Uniform Synthesizer},
address = {Sunnyvale, CA, USA},
month = {September},
year = {2016},
abstract = {The quality of the vocoder plays a crucial role in the performance of parametric speech synthesis systems. In order to improve the vocoder quality, it is necessary to reconstruct as much of the perceived components of the speech signal as possible. In this paper, we first show that the noise component is currently not accurately modelled in the widely used STRAIGHT vocoder, thus limiting the voice range that can be covered and also limiting the overall quality. In order to motivate a new, alternative approach to this issue, we present a new synthesizer, which uses a uniform representation for voiced and unvoiced segments. This synthesizer also has the advantage of using a simple signal model compared to other approaches, thus offering a convenient and controlled alternative for future developments. Experiments analysing the synthesis quality of the noise component show improved speech reconstruction using the suggested synthesizer compared to STRAIGHT. Additionally, an experiment on analysis/resynthesis shows that the suggested synthesizer solves some of the issues of another uniform vocoder, Harmonic Model plus Phase Distortion (HMPD). In text-to-speech synthesis, it outperforms HMPD and exhibits similar, or only slightly worse, quality than STRAIGHT, which is encouraging for a new vocoding approach.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2016pml.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2016pml.pdf}
}
[4] [pdf] G. Degottex, P. Lanchantin, and M. Gales, “Light Supervised Data Selection, Voice Quality Normalized Training and Log Domain Pulse Synthesis,” in Proc. Blizzard Challenge 2017 – EH1, Stockholm, Sweden, 2017.
[Bibtex]
@inproceedings{DegottexG2017bliz,
author = {G. Degottex and P. Lanchantin and M. Gales},
title = {Light Supervised Data Selection, Voice Quality Normalized Training and Log Domain Pulse Synthesis},
booktitle = {Proc. Blizzard Challenge 2017 - EH1},
address = {Stockholm, Sweden},
year = {2017},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2017bliz.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2017bliz.pdf}
}
[5] [pdf] L. Ardaillon, G. Degottex, and A. Roebel, “A multi-layer F0 model for singing voice synthesis using a B-spline representation with intuitive controls,” in Proc. Interspeech, Dresden, Germany, 2015.
[Bibtex]
@inproceedings{ArdaillonL2015f0model,
author = {L. Ardaillon and G. Degottex and A. Roebel},
booktitle = {Proc. Interspeech},
title = {A multi-layer F0 model for singing voice synthesis using a B-spline representation with intuitive controls},
address = {Dresden, Germany},
month = {September},
year = {2015},
abstract = {In singing voice, the fundamental frequency (F0) carries not only melody, but also music style, personal expressivity and other characteristics specific to the voice production mechanism. F0 modeling is therefore critical for natural-sounding and expressive synthesis. In addition, for artistic purposes, composers also need to have control over expressive parameters of the F0 curve, which is missing in many current approaches. This paper presents a novel parametric F0 model for singing voice synthesis with intuitive control of expressive parameters. The proposed approach considers the various F0 variations of the singing voice as separate layers, using B-splines to model the melodic component. This model has been implemented in a concatenative singing voice synthesis system and its perceived naturalness has been evaluated through listening tests. The validity of each layer is first evaluated independently, and the full model is then compared to real F0 curves from professional singers. The results of these tests suggest that the model is suitable for producing natural and expressive F0 contours.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/ArdaillonL2015f0model.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/ArdaillonL2015f0model.pdf}
}
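The layered construction lends itself to a compact sketch: below, a melodic layer is modelled as a clamped cubic B-spline in the log-F0 domain and a vibrato layer as a plain sinusoid. Knot positions, note values and vibrato settings are made-up illustrations, not the model's actual parametrization.

import numpy as np
from scipy.interpolate import BSpline

t = np.linspace(0.0, 2.0, 400)                       # time axis in seconds
# clamped cubic B-spline: len(knots) == len(ctrl) + degree + 1
knots = [0, 0, 0, 0, 0.5, 1.0, 1.5, 2, 2, 2, 2]
ctrl = np.log2([220, 220, 262, 330, 330, 330, 330])  # control points in log2-Hz
melody = BSpline(knots, ctrl, 3)(t)                  # melodic layer
vibrato = 0.02 * np.sin(2 * np.pi * 5.5 * t)         # 5.5 Hz vibrato layer
f0 = 2.0 ** (melody + vibrato)                       # combined F0 curve in Hz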
[6] [pdf] G. Degottex, L. Ardaillon, and A. Roebel, “Simple Multi Frame Analysis methods for estimation of Amplitude Spectral Envelope in Singing Voice,” in Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Shanghai, China, 2016.
[Bibtex]
@inproceedings{DegottexG2016mfasings,
author = {G. Degottex and L. Ardaillon and A. Roebel},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title = {Simple Multi Frame Analysis methods for estimation of Amplitude Spectral Envelope in Singing Voice},
address = {Shanghai, China},
month = {March},
year = {2016},
abstract = {In the state of the art, a single DFT frame is commonly used as a basis for building amplitude spectral envelopes. Multiple Frame Analysis (MFA) has already been suggested for envelope estimation, but often with excessive complexity. In this paper, two MFA-based methods are presented: one simplifying an existing Least Square (LS) solution, and another based on a simple linear interpolation. In the context of singing voice, we study sustained segments with vibrato, because these are critical for singing voice synthesis. They also provide a convenient context to study prior to extending this work to more general contexts. Numerical and perceptual experiments show clear improvements of the two described methods compared to the state of the art and encourage further studies in this research direction.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2016mfasings.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2016mfasings.pdf}
}
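Of the two methods mentioned in the abstract, the linear-interpolation one can be sketched in a few lines: harmonic (frequency, log-amplitude) samples from several neighbouring frames, spread across the frequency axis by the vibrato, are pooled and interpolated on a common grid. This is an illustrative reading of the abstract with an assumed data layout, not the paper's exact algorithm.

import numpy as np

def mfa_envelope(frames, grid):
    # frames: list of (freqs, log_amps) arrays of harmonic samples, one pair
    # per frame; the vibrato shifts the harmonics so the pooled samples cover
    # the frequency axis more densely than any single frame would
    freqs = np.concatenate([f for f, _ in frames])
    amps = np.concatenate([a for _, a in frames])
    order = np.argsort(freqs)
    return np.interp(grid, freqs[order], amps[order])  # envelope on the grid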
[7] [pdf] [doi] G. Degottex, L. Ardaillon, and A. Roebel, “Multi-Frame Amplitude Envelope Estimation for Modification of Singing Voice,” IEEE/ACM Transactions on Audio, Speech, and Language Processing, accepted 2016.
[Bibtex]
@article{DegottexG2016mfasingsj,
author={G. Degottex and L. Ardaillon and A. Roebel},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
title={Multi-Frame Amplitude Envelope Estimation for Modification of Singing Voice},
year={2016},
note={Accepted},
abstract={Singing voice synthesis benefits from very high quality estimation of the resonances and anti-resonances of the Vocal Tract Filter (VTF), i.e. an amplitude spectral envelope. In the state of the art, a single DFT frame is commonly used as a basis for building spectral envelopes. Even though Multiple Frame Analysis (MFA) has already been suggested for envelope estimation, it is not yet used in concrete applications. Indeed, even though existing attempts have shown very interesting results, we will demonstrate that they are either overcomplicated or fail to reach the high accuracy that is necessary for singing voice. In order to allow future applications of MFA, this article aims to improve the theoretical understanding and advantages of MFA-based methods. The use of singing voice signals is very beneficial for studying MFA methods because the VTF configuration can be relatively stable and, at the same time, the vibrato creates a regular variation that is easy to model. By simplifying and extending previous works, we also suggest and describe two MFA-based methods. To better understand the behaviors of the envelope estimates, we designed numerical measurements to assess Single Frame Analysis (SFA) and MFA methods using synthetic signals. With listening tests, we also designed two proofs of concept using pitch scaling and conversion of timbre. Both evaluations show clear and positive results for MFA-based methods, thus encouraging this research direction for future applications.},
doi={10.1109/TASLP.2016.2551863},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2016mfasingsj_acceptedversion.pdf}
}
[8] [pdf] [doi] G. Degottex and Y. Stylianou, “Analysis and Synthesis of Speech using an Adaptive Full-band Harmonic Model,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 21, iss. 10, pp. 2085-2095, 2013.
[Bibtex]
@article{DegottexG2013jahmair,
author = {G. Degottex and Y. Stylianou},
title = {Analysis and Synthesis of Speech using an Adaptive Full-band Harmonic Model},
journal={IEEE Transactions on Audio, Speech, and Language Processing},
abstract = {Voice models often use frequency limits to split the speech spectrum into two or more voiced/unvoiced frequency bands. However, in voice production, the amplitude spectrum of the voiced source decreases smoothly, without any abrupt frequency limit. Accordingly, multiband models struggle to estimate these limits and, as a consequence, artifacts can degrade the perceived quality. Using a linear frequency basis adapted to the non-stationarities of the speech signal, the Fan Chirp Transformation (FChT) has demonstrated harmonicity at frequencies higher than usually observed from the DFT, which motivates a full-band modeling. The previously proposed adaptive Quasi-Harmonic Model (aQHM) offers even more flexibility than the FChT by using a non-linear frequency basis. In the current paper, exploiting the properties of aQHM, we describe a full-band Adaptive Harmonic Model (aHM), along with detailed descriptions of its corresponding algorithms for the estimation of harmonics up to the Nyquist frequency. Formal listening tests show that the speech reconstructed using aHM is nearly indistinguishable from the original speech. Experiments with synthetic signals also show that the proposed aHM globally outperforms previous sinusoidal and harmonic models in terms of precision in estimating the sinusoidal parameters. As a perspective, such a precision is interesting for building higher level models upon the sinusoidal parameters, like spectral envelopes for speech synthesis.},
issn = {1558-7916},
number = {10},
pages = {2085--2095},
volume = {21},
year = {2013},
url={http://doi.org/10.1109/TASL.2013.2266772},
doi={10.1109/TASL.2013.2266772},
pdf={http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2013jahmair_acceptedversion.pdf}
}
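The heart of the model, a sum of harmonically related sinusoids whose frequencies follow the f0 curve, can be sketched as below. Real aHM also adapts the instantaneous phases and estimates its parameters by least squares, so this is a simplified illustration, not the published algorithm.

import numpy as np

def ahm_resynth(f0, amps, fs):
    # f0: (N,) instantaneous fundamental frequency in Hz, one value per sample
    # amps: (K, N) instantaneous amplitudes of the K harmonics
    phase = 2 * np.pi * np.cumsum(f0) / fs        # fundamental phase track
    k = np.arange(1, amps.shape[0] + 1)[:, None]  # harmonic numbers 1..K
    return np.sum(amps * np.cos(k * phase[None, :]), axis=0)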
[9] [pdf] G. Kafentzis, G. Degottex, O. Rosec, and Y. Stylianou, “Time-Scale Modifications Based on a Full-Band Adaptive Harmonic Model,” in Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Vancouver, Canada, 2013.
[Bibtex]
@inproceedings{KafentzisGP2013timescaleahm,
author = {G. Kafentzis and G. Degottex and O. Rosec and Y. Stylianou},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title = {Time-Scale Modifications Based on a Full-Band Adaptive Harmonic Model},
address = {Vancouver, Canada},
month = {May},
year = {2013},
abstract = {In this paper, a simple method for time-scale modifications of speech based on a recently suggested model for AM-FM decomposition of speech signals is presented. This model is referred to as the adaptive Harmonic Model (aHM). A full-band speech analysis/synthesis system based on the aHM representation is built, without the necessity of separating a deterministic and/or a stochastic component from the speech signal. The aHM models speech as a sum of harmonically related sinusoids that can adapt to the local characteristics of the signal and provide accurate instantaneous amplitude, frequency, and phase trajectories. Because of the high quality representation and reconstruction of speech, aHM can provide high quality time-scale modifications. Informal listening tests show that the synthetic time-scaled waveforms are natural and free of some common artifacts encountered in other state-of-the-art models, such as "metallic quality", chorusing, or musical noise.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KafentzisGP2013timescaleahm.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KafentzisGP2013timescaleahm.pdf}
}
[10] [pdf] G. Kafentzis, G. Degottex, O. Rosec, and Y. Stylianou, “Pitch Modifications of speech based on an Adaptive Harmonic Model,” in Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Florence, Italy, 2014.
[Bibtex]
@inproceedings{KafentzisGP2014pitchshiftahm,
author = {G. Kafentzis and G. Degottex and O. Rosec and Y. Stylianou},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title = {Pitch Modifications of speech based on an Adaptive Harmonic Model},
address = {Florence, Italy},
month = {May},
year = {2014},
abstract = {In this paper, a simple method for pitch-scale modifications of speech based on a recently suggested model for AM-FM decomposition of speech signals is presented. This model is referred to as the adaptive Harmonic Model (aHM). The aHM models speech as a sum of harmonically related sinusoids that can adapt to the local characteristics of the signal. It was shown that this model provides high quality reconstruction of speech and thus, it can also provide high quality pitch-scale modifications. For the latter, the amplitude envelope is estimated using the Discrete All-Pole (DAP) method, and the phase envelope estimation is performed by utilizing the concept of relative phase. Formal listening tests on a database of several languages show that the synthetic pitch-scaled waveforms are natural and free of some common artefacts encountered in other state-of-the-art models, such as HNM and STRAIGHT.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KafentzisGP2014pitchshiftahm.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KafentzisGP2014pitchshiftahm.pdf}
}
[11] [pdf] V. Morfi, G. Degottex, and A. Mouchtaris, “A Computationally Efficient Refinement of the Fundamental Frequency Estimate for the Adaptive Harmonic Model,” in Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Florence, Italy, 2014.
[Bibtex]
@inproceedings{MorfiV2014ppadft,
author = {V. Morfi and G. Degottex and A. Mouchtaris},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title = {A Computationally Efficient Refinement of the Fundamental Frequency Estimate for the Adaptive Harmonic Model},
address = {Florence, Italy},
month = {May},
year = {2014},
abstract = {The full-band Adaptive Harmonic Model (aHM) can be used by the Adaptive Iterative Refinement (AIR) algorithm to accurately model the perceived characteristics of a speech recording. However, the Least Squares (LS) solution used in the current aHM-AIR makes the $f_{0}$ refinement time-consuming, limiting the use of this algorithm for large databases. In this paper, a Peak Picking (PP) approach is suggested as a substitute for the LS solution. In order to integrate the adaptivity scheme of aHM into the PP approach, an adaptive Discrete Fourier Transform (aDFT) is also suggested in this paper, whose frequency basis can fully follow the frequency variations of the $f_{0}$ curve. Evaluations have shown an average speed-up of 5.5 times compared to the LS solution, while the quality of the re-synthesis is preserved compared to the original aHM-AIR.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/MorfiV2014ppadft.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/MorfiV2014ppadft.pdf}
}
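The adaptive DFT mentioned in the abstract, a frequency basis that follows the f0 curve, amounts to correlating the frame with a chirping complex exponential. A minimal sketch under assumed conventions (no windowing, per-sample f0):

import numpy as np

def adft_bin(x, f0, fs, k):
    # x: (N,) analysis frame; f0: (N,) instantaneous f0 in Hz over the frame
    # k: harmonic number; the basis frequency follows k * f0 instead of being
    # fixed as in the standard DFT
    phase = 2 * np.pi * k * np.cumsum(f0) / fs
    return np.sum(x * np.exp(-1j * phase))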
[12] [pdf] G. Degottex and N. Obin, “Phase Distortion Statistics as a Representation of the Glottal Source: Application to the Classification of Voice Qualities,” in Proc. Interspeech, Singapore, 2014.
[Bibtex]
@inproceedings{DegottexG2014pddvqclass,
author = {G. Degottex and N. Obin},
title = {Phase Distortion Statistics as a Representation of the Glottal Source: Application to the Classification of Voice Qualities},
booktitle = {Proc. Interspeech},
address = {Singapore},
month = {September},
year = {2014},
organization = {International Speech Communication Association (ISCA)},
abstract = {The representation of the glottal source is of paramount importance for describing para-linguistic information carried through the voice quality (e.g., emotions, mood, attitude). However, some existing representations of the glottal source are based on analytical glottal models, which assume strong a priori constraints on the shape of the glottal pulses. Thus, these representations are restricted to a limited number of voices. Recent progress in the estimation of glottal models has revealed that the Phase Distortion (PD) of the signal carries most of the information about the glottal pulses. This paper introduces a flexible representation of the glottal source, based on the short-term modelling of the phase distortion. This representation is not constrained by a specific analytical model, and thus can be used to describe a larger variety of expressive voices. We address the efficiency of this representation for the recognition of various voice qualities, in comparison to MFCC and standard glottal source representations.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2014pddvqclass.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2014pddvqclass.pdf}
}
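For intuition, here is one way to compute a Phase-Distortion-style statistic with circular statistics: a phase difference between neighbouring harmonics with the fundamental's contribution removed, and its circular deviation across frames. The exact definitions and normalizations in the paper differ, so every line here is an assumption.

import numpy as np

def phase_distortion_deviation(phi):
    # phi: (frames, K) instantaneous phases of K harmonics
    pd = np.diff(phi, axis=1) - phi[:, :1]        # assumed phase distortion definition
    r = np.abs(np.mean(np.exp(1j * pd), axis=0))  # circular mean resultant length
    return np.sqrt(-2.0 * np.log(np.maximum(r, 1e-12)))  # circular deviation per harmonic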
[13] [pdf] M. Koutsogiannaki, O. Simantiraki, G. Degottex, and Y. Stylianou, “The Importance of Phase on Voice Quality Assessment,” in Proc. Interspeech, Singapore, 2014.
[Bibtex]
@inproceedings{KoutsogiannakiM2014pddvpath,
author = {M. Koutsogiannaki and O. Simantiraki and G. Degottex and Y. Stylianou},
title = {The Importance of Phase on Voice Quality Assessment},
booktitle = {Proc. Interspeech},
address = {Singapore},
month = {September},
year = {2014},
organization = {International Speech Communication Association (ISCA)},
abstract = {State of the art objective measures for quantifying voice quality mostly consider estimation of features extracted from the magnitude spectrum. Assuming that speech is obtained by exciting a minimum-phase (vocal tract filter) and a maximum-phase component (glottal source), the amplitude spectrum cannot capture the maximum-phase characteristics. Since voice quality is connected to the glottal source, the extracted features should be linked with the maximum-phase component of speech. This work proposes a new metric based on the phase spectrum for characterizing the maximum-phase component of the glottal source. The proposed feature, the Phase Distortion Deviation, reveals the irregularities of the glottal pulses and therefore can be used for detecting voice disorders. This is evaluated in a ranking problem of speakers with spasmodic dysphonia. Results show that the obtained ranking is highly correlated with the subjective ranking provided by doctors in terms of overall severity, tremor and jitter. The high correlation of the suggested feature with different metrics reveals its ability to capture voice irregularities and highlights the importance of the phase spectrum in voice quality assessment.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KoutsogiannakiM2014pddvpath.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/KoutsogiannakiM2014pddvpath.pdf}
}
[14] [pdf] [doi] G. Degottex and D. Erro, “A uniform phase representation for the harmonic model in speech synthesis applications,” EURASIP Journal on Audio, Speech, and Music Processing – Special Issue: Models of Speech – In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
[Bibtex]
@article{DegottexG2014jhmpd,
author = {G. Degottex and D. Erro},
title = {A uniform phase representation for the harmonic model in speech synthesis applications},
journal={EURASIP Journal on Audio, Speech, and Music Processing - Special Issue: Models of Speech - In Search of Better Representations},
volume = {2014},
number = {1},
pages = {38},
year = {2014},
url={http://doi.org/10.1186/s13636-014-0038-1},
doi={10.1186/s13636-014-0038-1},
abstract = {Feature-based vocoders, e.g. STRAIGHT, offer a way to manipulate the perceived characteristics of the speech signal in speech transformation and synthesis. For the harmonic model, which provides excellent perceived quality, features for the amplitude parameters already exist (e.g. LSF, MFCC). However, because of the wrapping of the phase parameters, phase features are more difficult to design. To randomize the phase of the harmonic model during synthesis, a voicing feature is commonly used, which distinguishes voiced and unvoiced segments. However, voice production allows smooth transitions between voiced/unvoiced states, which makes the voicing segmentation sometimes tricky to estimate. In this article, two phase features are suggested to represent the phase of the harmonic model in a uniform way, without voicing decision. The synthesis quality of the resulting vocoder has been evaluated, using subjective listening tests, in the context of resynthesis, pitch scaling and HMM-based synthesis. The experiments show that the suggested signal model is comparable to STRAIGHT or even better in some scenarios. They also reveal some limitations of the harmonic framework itself in the case of high fundamental frequencies.},
pdf={http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2014jhmpd_acceptedversion.pdf}
}
[15] [pdf] G. Degottex and D. Erro, “A Measure of Phase Randomness for the Harmonic Model in Speech Synthesis,” in Proc. Interspeech, Singapore, 2014.
[Bibtex]
@inproceedings{DegottexG2014hmpdis,
author = {G. Degottex and D. Erro},
title = {A Measure of Phase Randomness for the Harmonic Model in Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Singapore},
month = {September},
year = {2014},
organization = {International Speech Communication Association (ISCA)},
abstract = {Modern statistical speech processing frameworks require the speech signals to be translated into feature vectors by means of vocoders. While features representing the amplitude envelope already exist (e.g. MFCC, LSF), parametrizing the phase information is far from straightforward, not only because it is circular data, but also because it shows an irregular behaviour in noisy time-frequency regions. Thus, many vocoders reconstruct speech by using minimum phases and random phases, relying on a previous voicing decision. In this paper, a phase feature is suggested to represent the randomness of the phase across the full time-frequency plane, in both voiced and unvoiced segments, without voicing decision. Resynthesis experiments show that, when integrated into a full-band harmonic vocoder, the suggested randomization feature is slightly better, on average, than STRAIGHT's aperiodicity. In HMM-based synthesis, the results show that the suggested vocoder reduces the complexity of the analysis and statistical modelling by removing the voicing decision, while keeping the perceived quality.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2014hmpdis.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2014hmpdis.pdf}
}
[16] [pdf] G. Degottex, “Glottal source and vocal tract separation,” PhD Thesis, Paris, France, 2010.
[Bibtex]
@phdthesis{Degottex2010,
author = {G. Degottex},
title = {Glottal source and vocal tract separation},
school = {UPMC-Ircam-UMR9912-STMS, Paris, France},
address = {Paris, France},
year = 2010,
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2010_PhD_v4_Final.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2010_PhD_v4_Final.pdf},
abstract = {This study addresses the problem of inverting a voice production model to retrieve, for a given recording, a representation of the sound source generated at the glottis level, the glottal source, and a representation of the resonances and anti-resonances of the vocal tract. This separation makes it possible to manipulate independently the elements composing the voice. There are many applications of this subject, such as those presented in this study, namely voice transformation and speech synthesis, as well as many others such as identity conversion, expressivity synthesis and voice restoration, which can be used in entertainment technologies, artistic sound installations, the movie and music industries, toys and video games, telecommunication, etc. In this study, we assume that the perceived elements of the voice can be manipulated using the well-known source-filter model. In the spectral domain, voice production is thus described as a multiplication of the spectra of its elements: the glottal source, the vocal-tract filter and the radiation. The second assumption used in this study concerns the deterministic component of the glottal source: we assume that a glottal model can fit one period of the glottal source. Using such an analytical description, the amplitude and phase spectra of the deterministic source are linked through the shape parameter of the glottal model. Regarding the state of the art of voice transformation and speech synthesis methods, the naturalness and the control of the transformed and synthesized voices should be improved. Accordingly, we try to answer the three following questions: 1) How to estimate the parameters of a glottal model? 2) How to estimate the vocal-tract filter according to this glottal model? 3) How to transform and synthesize a voiced signal using this glottal model? Special attention is given to the first question. We first assume that the glottal source and the impulse response of the vocal-tract filter are mixed-phase and minimum-phase signals, respectively. Then, based on these properties, various methods are proposed which minimize the mean squared phase of the convolutive residual of an observed spectrum and its model. A final method is described in which a unique shape parameter is given by a quasi closed-form expression of the observed spectrum. Additionally, this study discusses the conditions a glottal model and its parametrization have to satisfy in order to ensure that the parameter estimation is reliable using the proposed methods. These methods are also evaluated and compared to state-of-the-art methods using synthetic and electroglottographic signals. Using one of the proposed methods, the estimation of the shape parameter is independent of the position and the amplitude of the glottal model. Moreover, it is shown that this same method outperforms all the compared methods. To answer the second and third questions addressed in this study, we propose an analysis/synthesis procedure which estimates the vocal-tract filter according to an observed spectrum and its estimated source. Preference tests have been carried out and their results are presented in this study to compare the proposed procedure to existing ones. In terms of pitch transposition, it is shown that the overall quality of the voiced segments of a recording can be improved for large transposition factors. It is also shown that the breathiness of a voice can be controlled.}
}
[17] [pdf] [doi] G. Degottex, A. Roebel, and X. Rodet, “Phase minimization for glottal model estimation,” IEEE Transactions on Audio, Speech, and Language Processing, vol. 19, iss. 5, pp. 1080-1090, 2011.
[Bibtex]
@article{DegottexG2011msp,
author={G. Degottex and A. Roebel and X. Rodet},
title={Phase minimization for glottal model estimation},
journal={IEEE Transactions on Audio, Speech, and Language Processing},
publisher = {IEEE},
year={2011},
volume={19},
number={5},
pages={1080-1090},
month={July},
abstract = {In glottal source analysis, the phase minimization criterion has already been proposed to detect excitation instants. As shown in this article, this criterion can also be used to estimate the shape parameter of a glottal model (e.g., the Liljencrants-Fant model) and not only its time position. Additionally, we show that the shape parameter can be estimated independently of the glottal model position. The reliability of the proposed methods is evaluated with synthetic signals and compared to that of the IAIF and minimum/maximum-phase decomposition methods. The results of the methods are evaluated according to the influence of the fundamental frequency and noise. The estimation of a glottal model is useful for the separation of the glottal source and the vocal-tract filter and can therefore be applied in voice transformation and synthesis, as well as in clinical contexts or for the study of voice production.},
url={http://doi.org/10.1109/TASL.2010.2076806},
doi={10.1109/TASL.2010.2076806},
pdf={http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2011msp_acceptedversion.pdf}
}
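The phase minimization criterion lends itself to a tiny grid-search sketch: for each candidate shape parameter, divide the observed spectrum by the modelled one and measure how flat the phase of the convolutive residual is. Function names and the exact cost are illustrative assumptions, not the estimators proposed in the article.

import numpy as np

def estimate_shape(obs_spec, model_spec, candidates):
    # obs_spec: (bins,) observed harmonic spectrum (complex)
    # model_spec(s): complex model spectrum for shape parameter s (assumed callable)
    def cost(s):
        residual = obs_spec / model_spec(s)      # convolutive residual
        return np.mean(np.angle(residual) ** 2)  # mean squared phase
    return min(candidates, key=cost)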
[18] [pdf] G. Degottex, A. Roebel, and X. Rodet, “Joint estimate of shape and time-synchronization of a glottal source model by phase flatness,” in Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Dallas, USA, 2010, pp. 5058-5061.
[Bibtex]
@inproceedings{Degottex2010a,
author = {G. Degottex and A. Roebel and X. Rodet},
title = {Joint estimate of shape and time-synchronization of a glottal source model by phase flatness},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
address = {Dallas, USA},
pages = {5058-5061},
year = {2010},
abstract = {A new method is proposed to jointly estimate the shape parameter of a glottal model and its time position in a voiced segment. We show that the idea of phase flatness used in the most robust Glottal Closure Instant detection methods can be generalized to estimate the shape of the glottal model. In this paper, we validate the proposed method using synthetic signals. The robustness with respect to fundamental frequency and noise is evaluated. The estimation of the glottal source is useful for voice analysis (e.g., separation of the glottal source and vocal-tract filter), voice transformation and synthesis.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2010a.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2010a.pdf}
}
[19] [pdf] [doi] G. Degottex, A. Roebel, and X. Rodet, “Function of phase-distortion for glottal model estimation,” in Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Prague, Czech Republic, 2011, pp. 4608-4611.
[Bibtex]
@inproceedings{Degottex2011a,
author={G. Degottex and A. Roebel and X. Rodet},
title={Function of phase-distortion for glottal model estimation},
booktitle={Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages={4608-4611},
year={2011},
address={Prague, Czech Republic},
month={May},
keywords={Voice analysis , glottal model , glottal source , phase minimization , shape parameter},
abstract={In voice analysis, the parameter estimation of a glottal model, an analytic description of the deterministic component of the glottal source, is a challenging question for assessing voice quality in clinical use or for modeling voice production for speech transformation and synthesis using a priori constraints. In this paper, we first describe the Function of Phase-Distortion (FPD), which allows characterizing the shape of the periodic pulses of the glottal source independently of the other features of the glottal source. Then, using the FPD, we describe two methods to estimate a shape parameter of the Liljencrants-Fant glottal model. By comparison with state-of-the-art methods using Electro-Glotto-Graphic signals, we show that one of these methods outperforms the compared methods.},
doi={10.1109/ICASSP.2011.5947381},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2011a.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2011a.pdf}
}
[20] [pdf] M. Tahon, G. Degottex, and L. Devillers, “Usual voice quality features and glottal features for emotional valence detection,” in Proc. International Conference on Speech Prosody, Shanghai, China, 2012.
[Bibtex]
@inproceedings{TahonM2012a,
title = {Usual voice quality features and glottal features for emotional valence detection},
booktitle = {Proc. International Conference on Speech Prosody},
author = {M. Tahon and G. Degottex and L. Devillers},
address = {Shanghai, China},
month = {May},
year = {2012},
abstract = {In this paper, we focus on the detection of emotions collected in a real-life context. In order to improve our emotional valence detection system, we have tested new voice quality features that are mainly used for speech synthesis or voice transformation: the relaxation coefficient (Rd) and the Functions of Phase Distortion (FPD), as well as usual voice quality features. Distributions of voice quality features across speakers, gender, age and emotions are shown over the IDV-HR ecological corpus. Our results conclude that glottal and usual voice quality features are of interest for emotional valence detection, even when facing diverse kinds of voices in ecological situations.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/TahonM2012a.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/TahonM2012a.pdf}
}
[21] [pdf] S. Huber, A. Roebel, and G. Degottex, “Glottal source shape parameter estimation using phase minimization variants,” in Proc. Interspeech, Portland, USA, 2012.
[Bibtex]
@inproceedings{HuberS2012mspd2ix,
author = {S. Huber and A. Roebel and G. Degottex},
title = {Glottal source shape parameter estimation using phase minimization variants},
booktitle = {Proc. Interspeech},
address = {Portland, USA},
month = {September},
year = {2012},
organization = {International Speech Communication Association (ISCA)},
abstract = {The voice quality of speech production is related to the vibrating modes of the vocal folds. The LF model provides an analytic description of the deterministic component of the glottal source. A parameterisation of this model is approximated by the shape parameter Rd, which mainly describes the transition in voice quality from tense to breathy voices. In this paper, we first extend its defined range in order to better describe breathy voice qualities. Then we propose a new method to estimate the Rd parameter. By evaluating a combination of error surfaces of different Rd parameter estimation methods and by objective measurement tests, we verify the improvement of this new method over the state-of-the-art baseline approach.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/HuberS2012mspd2ix.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/HuberS2012mspd2ix.pdf}
}
[22] [pdf] [doi] G. Degottex, P. Lanchantin, A. Roebel, and X. Rodet, “Mixed source model and its adapted vocal tract filter estimate for voice transformation and synthesis,” Speech Communication, vol. 55, iss. 2, pp. 278-294, 2013.
[Bibtex]
@article{DegottexG2013svln,
author={G. Degottex and P. Lanchantin and A. Roebel and X. Rodet},
title={Mixed source model and its adapted vocal tract filter estimate for voice transformation and synthesis},
journal={Speech Communication},
publisher = {Elsevier},
volume = {55},
number = {2},
pages = {278-294},
year={2013},
abstract = {In current methods for voice transformation and speech synthesis, the vocal-tract filter is usually assumed to be excited by a flat amplitude spectrum. In this article, we present a method using a mixed source model defined as a mixture of the Liljencrants-Fant (LF) model and Gaussian noise. Using the LF model, the approach presented in this work is therefore close to a vocoder using an exogenous input, like ARX-based methods or the Glottal Spectral Separation (GSS) method. Such approaches are dedicated to voice processing, promising improved naturalness compared to generic signal models. Also, using spectral division as in GSS, we show that a glottal source model can be used in a more flexible way than in the ARX approach. A vocal-tract filter estimate is therefore derived to take into account the amplitude spectra of both the deterministic and random components of the glottal source. The proposed mixed source model is controlled by a small set of intuitive and independent parameters. The relevance of this voice production model is evaluated, through listening tests, in the context of resynthesis, HMM-based speech synthesis, breathiness modification and pitch transposition.},
url={http://doi.org/10.1016/j.specom.2012.08.010},
doi={10.1016/j.specom.2012.08.010},
issn = {0167-6393},
pdf={http://gillesdegottex.eu/wp-content/papercite-data/pdf/DegottexG2013svln_preprintR2_acceptedversion.pdf}
}
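The spectral-division step mentioned in the abstract (as in GSS) can be sketched as follows: the vocal-tract filter estimate is the observed spectrum divided by the assumed mixed-source and radiation spectra. All inputs and the flooring by the noise level are illustrative assumptions, not the SVLN estimator itself.

import numpy as np

def vtf_estimate(obs_spec, lf_spec, noise_floor, radiation):
    # obs_spec: (bins,) observed speech spectrum (complex)
    # lf_spec: (bins,) spectrum of the deterministic LF source component
    # noise_floor: scalar level of the Gaussian noise component
    # radiation: (bins,) radiation spectrum (e.g. a time derivative)
    source_mag = np.maximum(np.abs(lf_spec), noise_floor)  # mixed-source magnitude
    return obs_spec / (source_mag * radiation)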
[23] [pdf] [doi] G. Degottex, A. Roebel, and X. Rodet, “Pitch transposition and breathiness modification using a glottal source model and its adapted vocal-tract filter,” in Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Prague, Czech Republic, 2011, pp. 5128-5131.
[Bibtex]
@inproceedings{Degottex2011b,
author={G. Degottex and A. Roebel and X. Rodet},
title={Pitch transposition and breathiness modification using a glottal source model and its adapted vocal-tract filter},
booktitle={Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
pages={5128-5131},
year={2011},
address={Prague, Czech Republic},
month={May},
abstract={The transformation of the voiced segments of a speech recording has many applications, such as expressivity synthesis or voice conversion. This paper addresses pitch transposition and the modification of breathiness by means of an analytic description of the deterministic component of the voice source, a glottal model. Whereas this model is dedicated to voice production, most current methods can be applied to any pseudo-periodic signal. Using the described method, the synthesized voice is thus expected to better preserve naturalness compared to a more generic method. Using preference tests, it is shown that this method is preferred for large pitch transpositions (e.g., one octave) compared to two state-of-the-art methods. Additionally, it is shown that the breathiness of two male utterances can be controlled.},
doi={10.1109/ICASSP.2011.5947511},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2011b.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2011b.pdf}
}
[24] [pdf] P. Lanchantin, G. Degottex, and X. Rodet, “A HMM-based speech synthesis system using a new glottal source and vocal-tract separation method,” in Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Dallas, USA, 2010, pp. 4630-4633.
[Bibtex]
@inproceedings{Lanchantin2010,
author = {P. Lanchantin and G. Degottex and X. Rodet},
title = {A {HMM}-based speech synthesis system using a new glottal source and vocal-tract separation method},
booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
address = {Dallas, USA},
pages = {4630-4633},
year = {2010},
abstract = {This paper introduces an HMM-based speech synthesis system which uses a new method for the Separation of Vocal-tract and Liljencrants-Fant model plus Noise (SVLN). The glottal source is separated into two components: a deterministic Liljencrants-Fant glottal waveform model and a modulated Gaussian noise. This glottal source is first estimated and then used in the vocal-tract estimation procedure. The parameters of the source and the vocal tract are then included in HMM contextual models of phonemes. SVLN is promising for voice transformation in the synthesis of expressive speech, since it allows independent control of vocal-tract and glottal-source properties. The synthesis results are finally discussed and subjectively evaluated.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Lanchantin2010.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Lanchantin2010.pdf}
}
[25] [pdf] G. Degottex, A. Roebel, and X. Rodet, “Glottal Closure Instant detection from a glottal shape estimate,” in 13th International Conference on Speech and Computer (SPECOM), St-Petersburg, Russia, 2009, pp. 226-231.
[Bibtex]
@inproceedings{Degottex2009b,
author = {G. Degottex and A. Roebel and X. Rodet},
title = {Glottal Closure Instant detection from a glottal shape estimate},
booktitle = {13th International Conference on Speech and Computer (SPECOM)},
pages = {226--231},
year = 2009,
address = {St-Petersburg, Russia},
abstract = {GCI detection is a common problem in voice analysis, used for voice transformation and synthesis. The proposed innovative idea is to use a glottal shape estimate and a standard lip radiation model, instead of the common pre-emphasis, when computing the vocal-tract filter estimate. The time-derivative glottal source is then computed from the division in frequency of the speech spectrum by the vocal-tract filter. Prominent peaks are easy to locate in the time-derivative glottal source, and a complete process recovering all GCIs in a speech segment is therefore proposed which takes advantage of this. The GCI estimator is finally evaluated with synthetic signals and Electro-Glotto-Graphic signals.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2009b.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2009b.pdf}
}
[26] [pdf] G. Degottex, E. Bianco, and X. Rodet, “Usual to particular phonatory situations studied with high-speed videoendoscopy,” in The 6th International Conference on Voice Physiology and Biomechanics, ICVPB, Tampere, Finland, 2008, pp. 19-26.
[Bibtex]
@inproceedings{Degottex2008a,
author = {G. Degottex and E. Bianco and X. Rodet},
title = {Usual to particular phonatory situations studied with high-speed videoendoscopy},
booktitle = {The 6th International Conference on Voice Physiology and Biomechanics, ICVPB},
pages = {19-26},
year = 2008,
address = {Tampere, Finland},
month = {August},
abstract = {Current high-speed videoendoscopy (HSV) makes it possible to obtain 4000 images of the larynx per second. By this process, the analysis of the vocal folds can provide significant information. It is also possible to estimate the area of the glottis. All this information is useful for the study of the various phonatory modes, but also for glottal flow estimation, which improves our acoustic understanding of speech signals. For the usual modes, and then for other particular phonatory situations, we present a comparison of various speech signals: acoustic, Electro-Glotto-Graphic, glottal area, and an estimation of the glottal flow by inversion of the vocal tract.},
url = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2008a.pdf},
pdf = {http://gillesdegottex.eu/wp-content/papercite-data/pdf/Degottex2008a.pdf}
}