@inproceedings{e2d80a57431e49a5ad1fb7f776e2eda8,
  title     = {{Ada-SwinBERT}: Adaptive Token Selection for Efficient Video Captioning with Online Self-Distillation},
  abstract  = {Video captioning aims at producing textual descriptions for the given video. Benefiting from the self-attention mechanism for capturing long-distance dependencies between video patches and language sentences, the fully Transformer-based models achieve promising performance recently. However, due to continuous temporal information, there exists a large amount of redundant and unimportant visual content. Indiscriminate use of video patches results in expensive computation and inefficient use of resources. To tackle this issue, we propose Ada-SwinBERT, a novel approach that adaptively selects salient video tokens to achieve a balance between efficiency and performance for video captioning. Moreover, we devise a training strategy with online self-distillation to make up for the information loss caused by discarding video tokens. Video-text alignment knowledge distilled from the teacher leads to a robust training process. By pruning 78.1\% input tokens hierarchically, our approach greatly reduces 62.0\% FLOPs compared with the base model while achieving competitive performance with SOTA methods.},
  keywords  = {efficient multimodal transformer, self-distillation, token pruning, video captioning},
  author    = {Cao, Qianwen and Huang, Heyan and Liao, Minpeng and Mao, Xianling},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 2023 IEEE International Conference on Multimedia and Expo, ICME 2023 ; Conference date: 10-07-2023 Through 14-07-2023},
  year      = {2023},
  doi       = {10.1109/ICME55011.2023.00010},
  language  = {English},
  series    = {Proceedings - IEEE International Conference on Multimedia and Expo},
  publisher = {IEEE Computer Society},
  pages     = {7--12},
  booktitle = {Proceedings - 2023 {IEEE} International Conference on Multimedia and Expo, {ICME} 2023},
  address   = {United States},
}