% Short paper in the EDBT 2018 proceedings (pp. 461--464).
% Cleaned from an automated export:
%   - doi stores the bare identifier (no resolver prefix); styles add the URL.
%   - language is spelled in ASCII so classic 8-bit BibTeX can read the file.
%   - author/editor names use the unambiguous "Last, First" form.
% NOTE(review): month/day (1 Jan) look like an export placeholder; the note
%   field gives the conference dates as 26-29 March 2018 -- confirm which
%   date is intended before relying on month/day.
% NOTE(review): the editor surname "Bohlen" may have lost an umlaut in
%   export -- confirm against the proceedings front matter.
@inproceedings{fcbb8223b26b4a7da746e01f30b703cb,
  author    = {Deutch, Daniel and Moskovitch, Yuval and {Polack Gadassi}, Itay and Rinetzky, Noam},
  title     = {Towards hypothetical reasoning using distributed provenance},
  booktitle = {Advances in Database Technology - EDBT 2018},
  editor    = {Bohlen, Michael and Pichler, Reinhard and May, Norman and Rahm, Erhard and Wu, Shan-Hung and Hose, Katja},
  series    = {Advances in Database Technology - EDBT},
  pages     = {461--464},
  year      = {2018},
  month     = jan,
  day       = {1},
  doi       = {10.5441/002/edbt.2018.47},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2018 Copyright held by the owner/author(s); 21st International Conference on Extending Database Technology, EDBT 2018 ; Conference date: 26-03-2018 Through 29-03-2018},
  abstract  = {Hypothetical reasoning is the iterative examination of the effect of modifications to the data on the result of some computation or data analysis query. This kind of reasoning is commonly performed by data scientists to gain insights. Previous work has indicated that fine-grained data provenance can be instrumental for the efficient performance of hypothetical reasoning: instead of a costly re-execution of the underlying application, one may assign values to a pre-computed provenance expression. However, current techniques for fine-grained provenance tracking are ill-suited for large-scale data due to the overhead they entail on both execution time and memory consumption. We outline an approach for hypothetical reasoning for large-scale data. Our key insights are: (i) tracking only relevant parts of the provenance based on an a priori specification of classes of hypothetical scenarios that are of interest and (ii) the distributed tracking of provenance tailored to fit distributed data processing frameworks such as Apache Spark. We also discuss the challenges in both respects and our initial directions for addressing them.},
}