@comment{WABI 2016 paper on DOCKS (LNCS series). The doi field holds the bare identifier (no resolver prefix); language and address are given in English. NOTE(review): month/day are kept as exported (Jan 1), while the note field lists the conference dates as 22-24 Aug 2016 -- confirm the intended publication date. The LNCS volume number is not recorded here.}
@inproceedings{3d931e2849094f73b1693dcfa33c26dc,
  title     = {Compact universal k-mer hitting sets},
  abstract  = {We address the problem of finding a minimum-size set of k-mers that hits L-long sequences. The problem arises in the design of compact hash functions and other data structures for efficient handling of large sequencing datasets. We prove that the problem of hitting a given set of L-long sequences is NP-hard and give a heuristic solution that finds a compact universal k-mer set that hits any set of L-long sequences. The algorithm, called DOCKS (design of compact k-mer sets), works in two phases: (i) finding a minimum-size k-mer set that hits every infinite sequence; (ii) greedily adding k-mers such that together they hit all remaining L-long sequences. We show that DOCKS works well in practice and produces a set of k-mers that is much smaller than a random choice of k-mers. We present results for various values of k and sequence lengths L and by applying them to two bacterial genomes show that universal hitting k-mers improve on minimizers. The software and exemplary sets are freely available at acgt.cs.tau.ac.il/docks/.},
  author    = {Yaron Orenstein and David Pellow and Guillaume Mar{\c c}ais and Ron Shamir and Carl Kingsford},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2016.; 16th International Workshop on Algorithms in Bioinformatics, WABI 2016; Conference date: 22-08-2016 Through 24-08-2016},
  year      = {2016},
  month     = jan,
  day       = {1},
  doi       = {10.1007/978-3-319-43681-4_21},
  language  = {English},
  isbn      = {9783319436807},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Verlag},
  pages     = {257--268},
  editor    = {Martin Frith and Pedersen, {Christian N{\o}rgaard Storm}},
  booktitle = {Algorithms in Bioinformatics - 16th International Workshop, WABI 2016, Proceedings},
  address   = {Germany},
}