% Conference paper from the ICCT 2021 proceedings. Percent signs and the
% underscore in the abstract are LaTeX-escaped so the entry stays safe if a
% style or tool typesets the abstract; words that must keep their
% capitalisation under sentence-casing styles are brace-protected.
@inproceedings{ce8496a630934e47bf654602d0cc4321,
  author    = {Guo, Lunvi and Mu, Shining and Shi, Chaofan and Yan, Bo and Xiao, Zhouling and Yu, Sheng},
  title     = {A {Chinese} Speech Recognition System Based on Fusion Network Structure},
  booktitle = {2021 {IEEE} 21st International Conference on Communication Technology, {ICCT} 2021},
  series    = {International Conference on Communication Technology Proceedings, {ICCT}},
  pages     = {1271--1276},
  year      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  doi       = {10.1109/ICCT52962.2021.9657881},
  language  = {English},
  keywords  = {CTC, data preprocessing, Fusion structure, Markov language model, speech recognition},
  abstract  = {The purpose of an automatic speech recognition system is to convert speech into recognizable text. Chinese is a language in which the same pronunciation but different writing means different meanings. At present, there are relatively few researches on Chinese speech recognition. Therefore, we propose a Chinese automatic speech recognition system based on the fusion network RRAINet and End-to-End structure acoustic model + language model. We treat the speech signal as a visual problem, and use the Mel spectrum and SpecAugment methods to preprocess the data. The model is trained by connected time series classification criteria and decoded based on a greedy algorithm, which can convert speech signals into Chinese characters. Experiments show that the model phoneme error rate is 12.56\% and 12.38\% on the dev set and the test set of Free ST(ST-CMDS-20170001\_1-OS). The model word error rates are 18.79\% and 18.74\%, which are about 5\% lower than the baseline VGG-CTC model.},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE. 21st IEEE International Conference on Communication Technology, ICCT 2021; Conference date: 13-10-2021 through 16-10-2021},
}