% ECCV 2024 Workshops paper, published in the Lecture Notes in Computer Science series.
% NOTE(review): address carries the publisher's country as given in the record; the
% publisher's city is not visible here -- confirm and replace if the style needs a city.
@inproceedings{49a104ccf070431ca6032c60149391ca,
title = "Localization-Guided Supervision for Robust Medical Image Classification by Vision Transformers",
abstract = "A major challenge in developing data-driven algorithms for medical imaging is the limited size of available datasets. Furthermore, these datasets often suffer from inter-site heterogeneity caused by the use of different scanners and scanning protocols. These factors may contribute to overfitting, which undermines the generalization ability and robustness of deep learning classification models in the medical domain, leading to inadequate performance in real-world applications. To address these challenges and mitigate overfitting, we propose a framework which incorporates explanation supervision during training of Vision Transformer (ViT) models for image classification. Our approach leverages foreground masks of the class object during training to regularize attribution maps extracted from ViT, encouraging the model to focus on relevant image regions and make predictions based on pertinent features. We introduce a new method for generating explanatory attribution maps from ViT-based models and construct a dual-loss function that combines a conventional classification loss with a term that regularizes attribution maps. Our approach demonstrates superior performance over existing methods on two challenging medical imaging datasets, highlighting its effectiveness in the medical domain and its potential for application in other fields. Source code is available at: https://github.com/sagibe/LGMViT.",
keywords = "Attention, Explainability, Explainable AI, Explanation supervision, Image classification, Medical imaging, Vision Transformer",
author = "{Ben Itzhak}, Sagi and Kiryati, Nahum and Portnoy, Orith and Mayer, Arnaldo",
note = "Publisher Copyright: {\textcopyright} The Author(s) 2025.; Workshops that were held in conjunction with the 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024",
year = "2025",
doi = "10.1007/978-3-031-92648-8_8",
language = "English",
isbn = "9783031926471",
series = "Lecture Notes in Computer Science",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "118--133",
editor = "{Del Bue}, Alessio and Canton, Cristian and Pont-Tuset, Jordi and Tommasi, Tatiana",
booktitle = "Computer Vision -- ECCV 2024 Workshops, Proceedings",
address = "Germany",
}