@book{DraisbachNaumannSzottetal.2012, author = {Draisbach, Uwe and Naumann, Felix and Szott, Sascha and Wonneberg, Oliver}, title = {Adaptive windows for duplicate detection}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-143-1}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53007}, pages = {41}, year = {2012}, abstract = {Duplicate detection is the task of identifying all groups of records within a data set that represent the same real-world entity. This task is difficult, because (i) representations might differ slightly, so some similarity measure must be defined to compare pairs of records, and (ii) data sets might have a high volume, making a pair-wise comparison of all records infeasible. To tackle the second problem, many algorithms have been suggested that partition the data set and compare all record pairs only within each partition. One well-known such approach is the Sorted Neighborhood Method (SNM), which sorts the data according to some key and then advances a window over the data, comparing only records that appear within the same window. We propose several variations of SNM that have in common a varying window size and advancement. The general intuition of such adaptive windows is that there might be regions of high similarity suggesting a larger window size and regions of lower similarity suggesting a smaller window size. We propose and thoroughly evaluate several adaptation strategies, some of which are provably better than the original SNM in terms of efficiency (same results with fewer comparisons).}, language = {en} }
@book{AbedjanNaumann2011, author = {Abedjan, Ziawasch and Naumann, Felix}, title = {Advancing the discovery of unique column combinations}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-148-6}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564}, pages = {25}, year = {2011}, abstract = {Unique column combinations of a relational database table are sets of columns that contain only unique values. Discovering such combinations is a fundamental research problem and has many different data management and knowledge discovery applications. Existing discovery algorithms are either brute force or have a high memory load and can thus be applied only to small datasets or samples. In this paper, the well-known GORDIAN algorithm and "Apriori-based" algorithms are compared and analyzed for further optimization. We greatly improve the Apriori algorithms through efficient candidate generation and statistics-based pruning methods.
A hybrid solution HCAGORDIAN combines the advantages of GORDIAN and our new algorithm HCA, and it significantly outperforms all previous work in many situations.}, language = {en} } @article{HackerNaumannFriedrichetal.2022, author = {Hacker, Philipp and Naumann, Felix and Friedrich, Tobias and Grundmann, Stefan and Lehmann, Anja and Zech, Herbert}, title = {AI compliance - challenges of bridging data science and law}, series = {Journal of Data and Information Quality (JDIQ)}, volume = {14}, journal = {Journal of Data and Information Quality (JDIQ)}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3531532}, pages = {4}, year = {2022}, abstract = {This vision article outlines the main building blocks of what we term AI Compliance, an effort to bridge two complementary research areas: computer science and the law. Such research has the goal to model, measure, and affect the quality of AI artifacts, such as data, models, and applications, to then facilitate adherence to legal standards.}, language = {en} } @article{AbramowskiAceroAharonianetal.2013, author = {Abramowski, Attila and Acero, F. and Aharonian, Felix A. and Benkhali, Faical Ait and Akhperjanian, A. G. and Ang{\"u}ner, Ekrem Oǧuzhan and Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and Barnacka, Anna and Becherini, Yvonne and Tjus, J. Becker and Bernl{\"o}hr, K. and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J. and Brun, Francois and Brun, Pierre and Bulik, Tomasz and Carrigan, Svenja and Casanova, Sabrina and Cerruti, M. and Chadwick, Paula M. and Chalme-Calvet, R. and Chaves, Ryan C. G. and Cheesebrough, A. and Chretien, M. and Colafrancesco, Sergio and Cologna, Gabriele and Conrad, Jan and Couturier, C. and Dalton, M. and Daniel, M. K. and Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and Drury, L. O'C. and Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and Edwards, T. and Egberts, Kathrin and Eger, P. and Espigat, P. and Farnier, C. and Fegan, S. and Feinstein, F. and Fernandes, M. V. and Fernandez, D. and Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M. and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Gast, H. and Giebels, B. and Glicenstein, J. F. and Goering, D. and Grondin, M. -H. and Grudzinska, M. and Haeffner, S. and Hague, J. D. and Hahn, J. and Harris, J. and Heinzelmann, G. and Henri, G. and Hermann, G. and Hervet, O. and Hillert, A. and Hinton, James Anthony and Hofmann, W. and Hofverberg, P. and Holler, Markus and Horns, D. and Jacholkowska, A. and Jahn, C. and Jamrozy, M. and Janiak, M. and Jankowsky, F. and Jung, I. and Kastendieck, M. A. and Katarzynski, K. and Katz, U. and Kaufmann, S. and Khelifi, B. and Kieffer, M. and Klepser, S. and Klochkov, D. and Kluzniak, W. and Kneiske, T. and Kolitzus, D. and Komin, Nu. and Kosack, K. and Krakau, S. and Krayzel, F. and Krueger, P. P. and Laffon, H. and Lamanna, G. and Lefaucheur, J. and Lemoine-Goumard, M. and Lenain, J-P. and Lennarz, D. and Lohse, T. and Lopatin, A. and Lu, C-C. and Marandon, V. and Marcowith, Alexandre and Marx, R. and Maurin, G. and Maxted, N. and Mayer, M. and McComb, T. J. L. and Medina, M. C. and Mehault, J. and Menzler, U. and Meyer, M. and Moderski, R. and Mohamed, M. and Moulin, Emmanuel and Murach, T. and Naumann, C. L. and de Naurois, M. and Nedbal, D. and Niemiec, J. and Nolan, S. 
J. and Oakes, L. and Ohm, S. and Wilhelmi, E. de Ona and Opitz, B. and Ostrowski, M. and Oya, I. and Panter, M. and Parsons, R. D. and Arribas, M. Paz and Pekeur, N. W. and Pelletier, G. and Perez, J. and Petrucci, P-O. and Peyaud, B. and Pita, S. and Poon, H. and Puehlhofer, G. and Punch, M. and Quirrenbach, A. and Raab, S. and Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and de los Reyes, R. and Rieger, F. and Rob, L. and Rosier-Lees, S. and Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V. and Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R. and Schuessler, F. and Schulz, A. and Schwanke, U. and Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G. and Spiess, F. and Stawarz, L. and Steenkamp, R. and Stegmann, Christian and Stinzing, F. and Stycz, K. and Sushch, Iurii and Szostek, A. and Tavernet, J-P. and Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K. and van Eldik, C. and Vasileiadis, G. and Venter, C. and Viana, A. and Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M. and Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M. and Weitzel, Q. and White, R. and Wierzcholska, A. and Willmann, P. and Woernlein, A. and Wouters, D. and Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and Zech, Alraune and Zechlin, H-S.}, title = {Constraints on axionlike particles with HESS from the irregularity of the PKS 2155-304 energy spectrum}, series = {Physical review : D, Particles, fields, gravitation, and cosmology}, volume = {88}, journal = {Physical review : D, Particles, fields, gravitation, and cosmology}, number = {10}, publisher = {American Physical Society}, address = {College Park}, organization = {HESS Collaboration}, issn = {1550-7998}, doi = {10.1103/PhysRevD.88.102003}, pages = {12}, year = {2013}, abstract = {Axionlike particles (ALPs) are hypothetical light (sub-eV) bosons predicted in some extensions of the Standard Model of particle physics. In astrophysical environments comprising high-energy gamma rays and turbulent magnetic fields, the existence of ALPs can modify the energy spectrum of the gamma rays for a sufficiently large coupling between ALPs and photons. This modification would take the form of an irregular behavior of the energy spectrum in a limited energy range. Data from the H.E.S.S. observations of the distant BL Lac object PKS 2155-304 (z = 0.116) are used to derive upper limits at the 95\% C.L. on the strength of the ALP coupling to photons, $g_{\gamma a} < 2.1 \times 10^{-11}$ GeV$^{-1}$, for an ALP mass between 15 and 60 neV. The results depend on assumptions on the magnetic field around the source, which are chosen conservatively. The derived constraints apply to both light pseudoscalar and scalar bosons that couple to the electromagnetic field.}, language = {en} }
@book{BauckmannAbedjanLeseretal.2012, author = {Bauckmann, Jana and Abedjan, Ziawasch and Leser, Ulf and M{\"u}ller, Heiko and Naumann, Felix}, title = {Covering or complete? Discovering conditional inclusion dependencies}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-212-4}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-62089}, pages = {34}, year = {2012}, abstract = {Data dependencies, or integrity constraints, are used to improve the quality of a database schema, to optimize queries, and to ensure consistency in a database. In recent years, conditional dependencies have been introduced to analyze and improve data quality.
In short, a conditional dependency is a dependency with a limited scope defined by conditions over one or more attributes. Only the matching part of the instance must adhere to the dependency. In this paper, we focus on conditional inclusion dependencies (CINDs). We generalize the definition of CINDs, distinguishing covering and completeness conditions. We present a new use case for such CINDs, showing their value for solving complex data quality tasks. Further, we define quality measures for conditions inspired by precision and recall. We propose efficient algorithms that identify covering and completeness conditions conforming to given quality thresholds. Our algorithms choose not only the condition values but also the condition attributes automatically. Finally, we show that our approach efficiently provides meaningful and helpful results for our use case.}, language = {en} }
@misc{LosterNaumannEhmuelleretal.2018, author = {Loster, Michael and Naumann, Felix and Ehmueller, Jan and Feldmann, Benjamin}, title = {CurEx}, series = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, journal = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, publisher = {Association for Computing Machinery}, address = {New York}, isbn = {978-1-4503-6014-2}, doi = {10.1145/3269206.3269229}, pages = {1883 -- 1886}, year = {2018}, abstract = {The integration of diverse structured and unstructured information sources into a unified, domain-specific knowledge base is an important task in many areas. A well-maintained knowledge base enables data analysis in complex scenarios, such as risk analysis in the financial sector or investigating large data leaks, such as the Paradise or Panama papers. Both the creation of such knowledge bases and their continuous maintenance and curation involve many complex tasks and considerable manual effort. With CurEx, we present a modular system that allows structured and unstructured data sources to be integrated into a domain-specific knowledge base. In particular, we (i) enable the incremental improvement of each individual integration component; (ii) enable the selective generation of multiple knowledge graphs from the information contained in the knowledge base; and (iii) provide two distinct user interfaces tailored to the needs of data engineers and end-users, respectively. The former has curation capabilities and controls the integration process, whereas the latter focuses on the exploration of the generated knowledge graph.}, language = {en} }
@article{KossmannPapenbrockNaumann2021, author = {Koßmann, Jan and Papenbrock, Thorsten and Naumann, Felix}, title = {Data dependencies for query optimization}, series = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, volume = {31}, journal = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, number = {1}, publisher = {Springer}, address = {Berlin ; Heidelberg ; New York}, issn = {1066-8888}, doi = {10.1007/s00778-021-00676-3}, pages = {1 -- 22}, year = {2021}, abstract = {Effective query optimization is a core feature of any database management system.
While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata, including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies for relational database queries. The survey helps database vendors identify optimization opportunities and helps DBMS researchers find related work and open research questions.}, language = {en} }
@article{HameedNaumann2020, author = {Hameed, Mazhar and Naumann, Felix}, title = {Data Preparation}, series = {SIGMOD record}, volume = {49}, journal = {SIGMOD record}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0163-5808}, doi = {10.1145/3444831.3444835}, pages = {18 -- 29}, year = {2020}, abstract = {Raw data are often messy: they follow different encodings, records are not well structured, values do not adhere to patterns, etc. Such data are in general not fit to be ingested by downstream applications, such as data analytics tools, or even by data management systems. The act of obtaining information from raw data relies on some data preparation process. Data preparation is integral to advanced data analysis and data management, not only for data science but for any data-driven application. Existing data preparation tools are operational and useful, but there is still room for improvement and optimization. With increasing data volume and its messy nature, the demand for prepared data increases day by day.
To cater to this demand, companies and researchers are developing techniques and tools for data preparation. To better understand the available data preparation systems, we have conducted a survey to investigate (1) prominent data preparation tools, (2) distinctive tool features, (3) the need for preliminary data processing even for these tools, and (4) features and abilities that are still lacking. We conclude with an argument in support of automatic and intelligent data preparation beyond traditional and simplistic techniques.}, language = {en} }
@article{KoumarelasJiangNaumann2020, author = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix}, title = {Data preparation for duplicate detection}, series = {Journal of data and information quality : (JDIQ)}, volume = {12}, journal = {Journal of data and information quality : (JDIQ)}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3377878}, pages = {24}, year = {2020}, abstract = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints on domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations, an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection by up to 19\% in AUC-PR.}, language = {en} }
@book{AbedjanGolabNaumannetal., author = {Abedjan, Ziawasch and Golab, Lukasz and Naumann, Felix and Papenbrock, Thorsten}, title = {Data Profiling}, series = {Synthesis lectures on data management, 52}, journal = {Synthesis lectures on data management, 52}, publisher = {Morgan \& Claypool Publishers}, address = {San Rafael}, isbn = {978-1-68173-446-0}, pages = {xviii, 136}, language = {en} }