@inproceedings{4ebef036dc9149a8a8158522cf281e46,
  title     = {Multimodal Depression Detection Using a Deep Feature Fusion Network},
  abstract  = {Currently, more and more people are suffering from depression with the increase of social pressure, which has become one of the most severe health issues worldwide. Therefore, timely diagnosis of depression is very important. In this paper, a deep feature fusion network is proposed for multimodal depression detection. Firstly, an unsupervised autoencoder based on transformer is applied to derive the sentence-level embedding for the frame-level audiovisual features; then a deep feature fusion network based on a cross-modal transformer is proposed to fuse the text, audio and video features. The experimental results show that the proposed method achieves superior performance compared to state-of-the-art methods on the English database DAIC-WOZ.},
  keywords  = {Depression detection, multimodal feature fusion, transformer, unsupervised learning},
  author    = {Sun, Guangyao and Zhao, Shenghui and Zou, Bochao and An, Yubo},
  note      = {Publisher Copyright: {\textcopyright} 2022 SPIE.; 3rd International Conference on Computer Science and Communication Technology, ICCSCT 2022; Conference date: 30-07-2022 Through 31-07-2022},
  year      = {2022},
  doi       = {10.1117/12.2662620},
  language  = {English},
  series    = {Proceedings of SPIE - The International Society for Optical Engineering},
  publisher = {SPIE},
  editor    = {Lu, Yingfa and Cheng, Changbo},
  booktitle = {Third International Conference on Computer Science and Communication Technology, {ICCSCT} 2022},
  address   = {United States},
}