@inproceedings{0af07a83c08f45d0b410f7f2f98fcc54,
  title         = {{MOS} Predictor for Synthetic Speech with {i-Vector} Inputs},
  abstract      = {Based on deep learning technology, non-intrusive methods have received increasing attention for synthetic speech quality assessment since it does not need reference signals. Meanwhile, i-vector has been widely used in paralinguistic speech attribute recognition such as speaker and emotion recognition, but few studies have used it to estimate speech quality. In this paper, we propose a neural-network-based model that splices the deep features extracted by convolutional neural network (CNN) and i-vector on the time axis and uses Transformer encoder as time sequence model. To evaluate the proposed method, we improve the previous prediction models and conduct experiments on Voice Conversion Challenge (VCC) 2018 and 2016 dataset. Results show that i-vector contains information very related to the quality of synthetic speech and the proposed models that utilize i-vector and Transformer encoder highly increase the accuracy of MOSNet and MBNet on both utterance-level and system-level results.},
  keywords      = {Transformer encoder, i-vector, speech quality assessment, speech synthesis},
  author        = {Liu, Miao and Wang, Jing and Li, Shicong and Xiang, Fei and Yao, Yue and Yang, Lidong},
  note          = {Publisher Copyright: {\textcopyright} 2022 IEEE; 47th IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2022 ; Conference date: 23-05-2022 Through 27-05-2022},
  year          = {2022},
  doi           = {10.1109/ICASSP43922.2022.9747533},
  language      = {English},
  series        = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  publisher     = {Institute of Electrical and Electronics Engineers Inc.},
  pages         = {906--910},
  booktitle     = {2022 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2022 - Proceedings},
  address       = {United States},
  internal-note = {NOTE(review): address should be the publisher's city, not a country -- TODO confirm (IEEE proceedings are typically Piscataway, NJ)},
}