% Workshop paper (Insights 2024). Cleaned from a portal-style export:
%  - month follows the conference date recorded in the note field (20-06-2024);
%    the exported "1 January" was a placeholder, so the non-standard day field is dropped.
%  - series was a verbatim copy of booktitle and printed the proceedings name twice; removed.
%  - acronyms are brace-protected so sentence-casing styles keep them uppercase.
%  - NOTE(review): address holds a country rather than a publisher city -- confirm and refine.
@inproceedings{be4ff54f7f154bb99b86a0de3155c762,
  author    = {Cognetta, Marco and Hiraoka, Tatsuya and Okazaki, Naoaki and Sennrich, Rico and Pinter, Yuval},
  title     = {An Analysis of {BPE} Vocabulary Trimming in Neural Machine Translation},
  booktitle = {Insights 2024 - 5th Workshop on Insights from Negative Results in {NLP}, Proceedings of the Workshop},
  editor    = {Tafreshi, Shabnam and Akula, {Arjun Reddy} and Sedoc, Joao and Drozd, Aleksandr and Rogers, Anna and Rumshisky, Anna},
  pages     = {48--50},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2024},
  month     = jun,
  language  = {American English},
  note      = {Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics.; 5th Workshop on Insights from Negative Results in NLP, Insights 2024 ; Conference date: 20-06-2024},
  abstract  = {We explore threshold vocabulary trimming in Byte-Pair Encoding subword tokenization, a tokenization postprocessing step that replaces rare subwords with their component subwords. The technique is available in popular tokenization libraries but has not been subjected to rigorous scientific scrutiny. While the removal of rare subwords is suggested as best practice in model implementations, both as a means to reduce model size and for improving model performance through robustness, our experiments indicate that, across a large space of hyperparameter settings, vocabulary trimming fails to consistently improve model performance, and is even prone to incurring heavy degradation.},
}