% ZeroCap (Tewel, Shalev, Schwartz, Wolf), published in the CVPR 2022 proceedings.
% Field values are kept in ASCII/English so the entry is usable with classic
% BibTeX as well as Biber; case-sensitive names and acronyms are brace-protected.
% NOTE(review): `address` carries only the country given by the export; replace
% with the publisher's city once confirmed.
@inproceedings{2f2406c2ac904d06bccb1e6d3010d008,
  author    = {Tewel, Yoad and Shalev, Yoav and Schwartz, Idan and Wolf, Lior},
  title     = {{ZeroCap}: Zero-Shot Image-to-Text Generation for Visual-Semantic Arithmetic},
  booktitle = {Proceedings - 2022 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2022},
  series    = {Proceedings of the {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition},
  publisher = {IEEE Computer Society},
  address   = {United States},
  pages     = {17897--17907},
  year      = {2022},
  doi       = {10.1109/CVPR52688.2022.01739},
  language  = {english},
  keywords  = {Transfer/low-shot/long-tail learning, Vision + language},
  abstract  = {Recent text-to-image matching models apply contrastive learning to large corpora of uncurated pairs of images and sentences. While such models can provide a powerful score for matching and subsequent zero-shot tasks, they are not capable of generating caption given an image. In this work, we repurpose such models to generate a descriptive text given an image at inference time, without any further training or tuning step. This is done by combining the visual-semantic model with a large language model, benefiting from the knowledge in both web-scale models. The resulting captions are much less restrictive than those obtained by supervised captioning methods. Moreover, as a zero-shot learning method, it is extremely flexible and we demonstrate its ability to perform image arithmetic in which the inputs can be either images or text and the output is a sentence. This enables novel high-level vision capabilities such as comparing two images or solving visual analogy tests. Our code is available at: https://github.com/YoadTew/zero-shot-image-to-text.},
  note      = {Publisher Copyright: {\textcopyright} 2022 IEEE.; 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022 ; Conference date: 19-06-2022 Through 24-06-2022},
}