% EMNLP 2021 conference paper (Blaier, Malkiel, Wolf) on caption-enriched
% samples for hateful-meme detection.
% Cleanup notes:
%   - language/address were exported in Arabic; translated to English so the
%     entry is ASCII-safe for classic BibTeX.
%   - series was an exact duplicate of booktitle and has been dropped to avoid
%     the proceedings name being printed twice.
%   - a stray footnote marker ("1") was removed from the end of the abstract.
% NOTE(review): address holds a country rather than a publisher city -- confirm
% the intended location against the publisher record.
@inproceedings{50bc5f079bcf425bb622bf5e3e861227,
  author    = {Blaier, Efrat and Malkiel, Itzik and Wolf, Lior},
  title     = {Caption Enriched Samples for Improving Hateful Memes Detection},
  booktitle = {EMNLP 2021 - 2021 Conference on Empirical Methods in Natural Language Processing, Proceedings},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2021},
  pages     = {9350--9358},
  doi       = {10.18653/v1/2021.emnlp-main.738},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2021 Association for Computational Linguistics; 2021 Conference on Empirical Methods in Natural Language Processing, EMNLP 2021 ; Conference date: 07-11-2021 Through 11-11-2021},
  abstract  = {The recently introduced hateful meme challenge demonstrates the difficulty of determining whether a meme is hateful or not. Specifically, both unimodal language models and multimodal vision-language models cannot reach the human level of performance. Motivated by the need to model the contrast between the image content and the overlayed text, we suggest applying an off-the-shelf image captioning tool in order to capture the first. We demonstrate that the incorporation of such automatic captions during fine-tuning improves the results for various unimodal and multimodal models. Moreover, in the unimodal case, continuing the pre-training of language models on augmented and original caption pairs, is highly beneficial to the classification accuracy. Our code is publicly available.},
}