@inproceedings{c45a58de3ec447f3b49ddfaf4d09da03,
title = "Backward Lens: Projecting Language Model Gradients into the Vocabulary Space",
abstract = "Understanding how Transformer-based Language Models (LMs) learn and recall information is a key goal of the deep learning community. Recent interpretability methods project weights and hidden states obtained from the forward pass to the models' vocabularies, helping to uncover how information flows within LMs. In this work, we extend this methodology to LMs' backward pass and gradients. We first prove that a gradient matrix can be cast as a low-rank linear combination of its forward and backward passes' inputs. We then develop methods to project these gradients into vocabulary items and explore the mechanics of how new information is stored in the LMs' neurons. Our code is available at: https://github.com/shacharKZ/BackwardLens.",
author = "Shahar Katz and Yonatan Belinkov and Mor Geva and Lior Wolf",
note = "Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics.; 2024 Conference on Empirical Methods in Natural Language Processing, EMNLP 2024 ; Conference date: 12-11-2024 Through 16-11-2024",
year = "2024",
doi = "10.18653/v1/2024.emnlp-main.142",
language = "الإنجليزيّة",
series = "EMNLP 2024 - 2024 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference",
publisher = "Association for Computational Linguistics (ACL)",
pages = "2390--2422",
editor = "Yaser Al-Onaizan and Mohit Bansal and Yun-Nung Chen",
booktitle = "EMNLP 2024 - 2024 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference",
address = "الولايات المتّحدة",
}