% Conference paper (ICSP 2018) on frame-level LSTM speaking rate estimation.
% The conference took place in August 2018 (see the note field); year/month/day
% record the 2019-02-02 date carried by the exported record.
% Names are stored in "Last, First" form so BibTeX does not have to guess the
% surname; the editors were exported family-name-first without a comma.
% Acronyms are brace-protected so sentence-casing styles keep their capitals.
% NOTE(review): address holds a country, not the publisher's city -- confirm
% the intended value before relying on it in a style that prints address.
@inproceedings{1f6b4fd0d80649abbe175457fb5eedb7,
  author    = "Xiao, Yanhong and Du, Shixuan and Xie, Xiang and Wang, Jing and Zhan, Qingran",
  title     = "A Modified Speaking Rate Estimation Based on Frame-Level {LSTM}",
  abstract  = "Speaking rate has various applications in many domains such as speech recognition, speaker verification, emotion recognition, etc. It conveys long-term information in speech and changes over time which can be seen as a kind of time sequence. This paper proposes a frame-level LSTM speaking rate estimation method. Instead of taking the whole utterance as a sequence, the frame-level LSTM exploits the sequence information in each segment and brings a more precise segmented speaking rate estimation. We also evaluate the influence of fixed-length segmentation and voice activity detection(vad) segmentation on speaking rate estimation. Results show that the proposed frame-level LSTM method yields a high correlation between the estimated speaking rate and the ground truth. It achieves a relative improvement of 13.0\% compared to the state of the art statistical learning method and 16.3\% over the support vector regression(SVR) evaluated on the same TIMIT corpus.",
  keywords  = "Frame-level LSTM, Segmentation, Speaking rate estimation",
  note      = "Publisher Copyright: {\textcopyright} 2018 IEEE.; 14th IEEE International Conference on Signal Processing, ICSP 2018 ; Conference date: 12-08-2018 Through 16-08-2018",
  year      = "2019",
  month     = feb,
  day       = "2",
  doi       = "10.1109/ICSP.2018.8652347",
  language  = "English",
  series    = "International Conference on Signal Processing Proceedings, {ICSP}",
  publisher = "Institute of Electrical and Electronics Engineers Inc.",
  pages     = "600--603",
  editor    = "Yuan, Baozong and Ruan, Qiuqi and Zhao, Yao and An, Gaoyun",
  booktitle = "2018 14th {IEEE} International Conference on Signal Processing Proceedings, {ICSP} 2018",
  address   = "United States",
}