% Conference paper in the ICDAR 2023 proceedings (Springer LNCS).
% Cleaned-up database export: names are in "Last, First" form, the DOI is
% stored bare (no resolver prefix), and dashes are written as ASCII "--".
% NOTE(review): "address" should be the publisher's city, but this record
%   only supplies a country -- confirm the city and replace.
% NOTE(review): the LNCS volume number is not present in this record --
%   TODO add a "volume" field once confirmed.
@inproceedings{f0678c60ccf840e286c8546bf705907a,
  author    = {Londner, Samuel and Phillips, Yoav and Miller, Hadar and Dershowitz, Nachum and Kuflik, Tsvi and Lavee, Moshe},
  title     = {Linguistic Knowledge Within Handwritten Text Recognition Models: A Real-World Case Study},
  booktitle = {Document Analysis and Recognition -- {ICDAR} 2023 -- 17th International Conference, Proceedings},
  editor    = {Fink, Gernot A. and Jain, Rajiv and Kise, Koichi and Zanibbi, Richard},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2023},
  pages     = {147--164},
  doi       = {10.1007/978-3-031-41685-9_10},
  isbn      = {9783031416842},
  language  = {American English},
  keywords  = {Handwritten text recognition, Hebrew manuscripts, Language model, Optical character recognition, Transfer learning},
  abstract  = {State-of-the-art handwritten text recognition models make frequent use of deep neural networks, with recurrent and connectionist temporal classification layers, which perform recognition over sequences of characters. This architecture may lead to the model learning statistical linguistic features of the training corpus, over and above graphic features. This in turn could lead to degraded performance if the evaluation dataset language differs from the training corpus language. We present a fundamental study aiming to understand the inner workings of OCR models and further our understanding of the use of RNNs as decoders. We examine a real-world example of two graphically similar medieval documents but in different languages: rabbinical Hebrew and Judeo-Arabic. We analyze, computationally and linguistically, the cross-language performance of the models over these documents, so as to gain some insight into the implicit language knowledge the models may have acquired. We find that the implicit language model impacts the final word error by around 10\%. A combined qualitative and quantitative analysis allow us to isolate manifest linguistic hallucinations. However, we show that leveraging a pretrained (Hebrew, in our case) model allows one to boost the OCR accuracy for a resource-scarce language (such as Judeo-Arabic). All our data, code, and models are openly available at https://github.com/anutkk/ilmja.},
  note      = {Publisher Copyright: {\textcopyright} 2023, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 17th International Conference on Document Analysis and Recognition, ICDAR 2023 ; Conference date: 21-08-2023 Through 26-08-2023},
}