% Conference paper presenting the DiscoFuse sentence-fusion dataset (NAACL HLT 2019).
% The citation key is kept as-is so existing \cite commands keep resolving.
% `language` and `address` are given in English so the record stays ASCII-safe
% under classic BibTeX.
% NOTE(review): `address` holds a country, not the publisher's city -- refine if known.
@inproceedings{212b581e64bd40de9cec61fd6c37b635,
  author    = {Geva, Mor and Malmi, Eric and Szpektor, Idan and Berant, Jonathan},
  title     = {{DiscoFuse}: A Large-Scale Dataset for Discourse-Based Sentence Fusion},
  booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies ({NAACL HLT} 2019): Long and Short Papers},
  pages     = {3443--3455},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {United States},
  year      = {2019},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2019 Association for Computational Linguistics; 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL HLT 2019 ; Conference date: 02-06-2019 Through 07-06-2019},
  abstract  = {Sentence fusion is the task of joining several independent sentences into a single coherent text. Current datasets for sentence fusion are small and insufficient for training modern neural models. In this paper, we propose a method for automatically-generating fusion examples from raw text and present DISCOFUSE, a large scale dataset for discourse-based sentence fusion. We author a set of rules for identifying a diverse set of discourse phenomena in raw text, and decomposing the text into two independent sentences. We apply our approach on two document collections: Wikipedia and Sports articles, yielding 60 million fusion examples annotated with discourse information required to reconstruct the fused text. We develop a sequence-to-sequence model on DISCOFUSE and thoroughly analyze its strengths and weaknesses with respect to the various discourse phenomena, using both automatic as well as human evaluation. Finally, we conduct transfer learning experiments with WEB-SPLIT, a recent dataset for text simplification. We show that pretraining on DISCOFUSE substantially improves performance on WEB-SPLIT when viewed as a sentence fusion task.},
}