% Conference paper in the ICDM 2019 proceedings. The doi field holds the bare DOI (no resolver prefix); styles add the link.
@inproceedings{5efaf60477a84cef8a7f23a8d5c47afb,
  author    = {Mandros, Panagiotis and Boley, Mario and Vreeken, Jilles},
  title     = {Discovering reliable correlations in categorical data},
  booktitle = {Proceedings - 19th {IEEE} International Conference on Data Mining, {ICDM} 2019},
  series    = {Proceedings - {IEEE} International Conference on Data Mining, {ICDM}},
  editor    = {Wang, Jianyong and Shim, Kyuseok and Wu, Xindong},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {1252--1257},
  year      = {2019},
  month     = nov,
  doi       = {10.1109/ICDM.2019.00156},
  language  = {American English},
  keywords  = {Branch-and-bound, Information theory, Knowledge discovery, Optimization, Total correlation},
  abstract  = {In many scientific tasks we are interested in finding correlations in our data. This raises many questions, such as how to reliably and interpretably measure correlation between a multivariate set of attributes, how to do so without having to make assumptions on data distribution or the type of correlation, and, how to search efficiently for the most correlated attribute sets. We answer these questions for discovery tasks with categorical data. In particular, we propose a corrected-for-chance, consistent, and efficient estimator for normalized total correlation, in order to obtain a reliable, interpretable, and non-parametric measure for correlation over multivariate sets. For the discovery of the top-k correlated sets, we derive an effective algorithmic framework based on a tight bounding function. This framework offers exact, approximate, and heuristic search. Empirical evaluation shows that already for small sample sizes the estimator leads to low-regret optimization outcomes, while the algorithms are shown to be highly effective for both large and high-dimensional data. Through a case study we confirm that our discovery framework identifies interesting and meaningful correlations.},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 19th IEEE International Conference on Data Mining, ICDM 2019 ; Conference date: 08-11-2019 Through 11-11-2019},
}