@inproceedings{04a5d031b2064842bbd86a1e27965712,
title = "Entity-aware and Motion-aware Transformers for Language-driven Action Localization",
abstract = "Language-driven action localization in videos is a challenging task that involves not only visual-linguistic matching but also action boundary prediction. Recent progress has been achieved through aligning language queries to video segments, but estimating precise boundaries is still under-explored. In this paper, we propose entity-aware and motion-aware Transformers that progressively localize actions in videos by first coarsely locating clips with entity queries and then finely predicting exact boundaries in a shrunken temporal region with motion queries. The entity-aware Transformer incorporates textual entities into visual representation learning via cross-modal and cross-frame attentions to facilitate attending to action-related video clips. The motion-aware Transformer captures fine-grained motion changes at multiple temporal scales by integrating long short-term memory into the self-attention module, further improving the precision of action boundary prediction. Extensive experiments on the Charades-STA and TACoS datasets demonstrate that our method achieves better performance than existing methods.",
author = "Shuo Yang and Xinxiao Wu",
note = "Publisher Copyright: {\textcopyright} 2022 International Joint Conferences on Artificial Intelligence. All rights reserved. 31st International Joint Conference on Artificial Intelligence, IJCAI 2022; Conference date: 23-07-2022 through 29-07-2022",
year = "2022",
language = "English",
series = "IJCAI International Joint Conference on Artificial Intelligence",
publisher = "International Joint Conferences on Artificial Intelligence",
pages = "1552--1558",
editor = "{De Raedt}, Luc",
booktitle = "Proceedings of the 31st International Joint Conference on Artificial Intelligence, IJCAI 2022",
}