@inproceedings{f823eceb723f40308e478ec8654860bf,
  title     = {Audio-Visual Speech Separation Using {I-Vectors}},
  abstract  = {Speech separation is the task of extracting target speech from background interference. In applications like home devices or office meeting, prior knowledge about possible speaker is available, which can be leveraged for speech separation. This paper proposes a novel audio-visual-speaker speech separation model that decomposes a monaural speech signal into two speech segments belonging to different speakers, by making use of audio-visual inputs and i-vector speaker embeddings. The proposed model is based on a BLSTM network to generate complex time-frequency masks which can be applied to the acoustic mixed-speech spectrogram. We train and evaluate our model on a speech separation task derived from the VoxCeleb2 dataset and show effectiveness of the method.},
  keywords  = {Audio-visual speech separation, Cocktail party problem, I-vectors, Speaker embeddings},
  author    = {Luo, Yiyu and Wang, Jing and Wang, Xinyao and Wen, Liang and Wang, Lizhong},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 2nd IEEE International Conference on Information Communication and Signal Processing, ICICSP 2019 ; Conference date: 28-09-2019 Through 30-09-2019},
  year      = {2019},
  month     = sep,
  doi       = {10.1109/ICICSP48821.2019.8958547},
  language  = {English},
  series    = {2019 2nd {IEEE} International Conference on Information Communication and Signal Processing, {ICICSP} 2019},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {276--280},
  booktitle = {2019 2nd {IEEE} International Conference on Information Communication and Signal Processing, {ICICSP} 2019},
  address   = {United States},
  internal-note = {review: address holds a country; BibTeX address should be the publisher's city -- verify against the publisher record},
}