% Conference paper in the ICASSP 2020 proceedings (see booktitle/doi below).
% The citation key looks exporter-generated; it is kept unchanged so that
% existing \cite commands still resolve.
% NOTE(review): `address` holds a country rather than a publisher city --
% TODO confirm the intended value before relying on it in printed output.
@inproceedings{eb934585ced944639014aa5192650d71,
  author    = {Xia, Yaxian and Huang, Lun and Wang, Wenmin and Wei, {Xiao Yong} and Chen, Jie},
  title     = {Exploring Entity-Level Spatial Relationships for Image-Text Matching},
  booktitle = {2020 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2020 -- Proceedings},
  series    = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing -- Proceedings},
  pages     = {4452--4456},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2020},
  month     = may,
  doi       = {10.1109/ICASSP40776.2020.9054758},
  language  = {English},
  keywords  = {Deep learning, entity-level relation, image-text matching, relative position},
  abstract  = {Exploring the entity-level (i.e., objects in an image, words in a text) spatial relationship contributes to understanding multimedia content precisely. The ignorance of spatial information in previous works probably leads to misunderstandings of image contents. For instance, sentences `Boats are on the water' and `Boats are under the water' describe the same objects, but correspond to different sceneries. To this end, we utilize the relative position of objects to capture entity-level spatial relationships for image-text matching. Specifically, we fuse semantic and spatial relationships of image objects in a visual intra-modal relation module. The module performs promisingly to understand image contents and improve object representation learning. It contributes to capturing entity-level latent correspondence of image-text pairs. Then the query (text) plays a role of textual context to refine the interpretable alignments of image-text pairs in the inter-modal relation module. Our proposed method achieves state-of-the-art results on MSCOCO and Flickr30K datasets.},
  note      = {Publisher Copyright: {\textcopyright} 2020 IEEE. 2020 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2020; Conference date: 4--8 May 2020},
}