% Conference paper in the ECCV 2020 proceedings (Springer LNCS series).
% The doi field holds the bare DOI (no resolver prefix); all values are ASCII/LaTeX.
@inproceedings{1ab9fe564c2d438ca9590479f850b380,
  author    = {Gupta, Tanmay and Vahdat, Arash and Chechik, Gal and Yang, Xiaodong and Kautz, Jan and Hoiem, Derek},
  title     = {Contrastive Learning for Weakly Supervised Phrase Grounding},
  booktitle = {Computer Vision -- {ECCV} 2020 - 16th European Conference 2020, Proceedings},
  editor    = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {752--768},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2020},
  doi       = {10.1007/978-3-030-58580-8_44},
  isbn      = {9783030585792},
  language  = {English},
  keywords  = {Attention, Grounding, InfoNCE, Mutual information},
  abstract  = {Phrase grounding, the problem of associating image regions to caption words, is a crucial component of vision-language tasks. We show that phrase grounding can be learned by optimizing word-region attention to maximize a lower bound on mutual information between images and caption words. Given pairs of images and captions, we maximize compatibility of the attention-weighted regions and the words in the corresponding caption, compared to non-corresponding pairs of images and captions. A key idea is to construct effective negative captions for learning through language model guided word substitutions. Training with our negatives yields a $\sim$10\% absolute gain in accuracy over randomly-sampled negatives from the training data. Our weakly supervised phrase grounding model trained on COCO-Captions shows a healthy gain of 5.7\% to achieve 76.7\% accuracy on Flickr30K Entities benchmark. Our code and project material will be available at http://tanmaygupta.info/info-ground.},
  note      = {Publisher Copyright: {\textcopyright} 2020, Springer Nature Switzerland AG.; 16th European Conference on Computer Vision, ECCV 2020 ; Conference date: 23-08-2020 Through 28-08-2020},
}