@book{DraisbachNaumannSzottetal.2012, author = {Draisbach, Uwe and Naumann, Felix and Szott, Sascha and Wonneberg, Oliver}, title = {Adaptive windows for duplicate detection}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-143-1}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53007}, publisher = {Universit{\"a}t Potsdam}, pages = {41}, year = {2012}, abstract = {Duplicate detection is the task of identifying all groups of records within a data set that represent the same real-world entity, respectively. This task is difficult, because (i) representations might differ slightly, so some similarity measure must be defined to compare pairs of records and (ii) data sets might have a high volume making a pair-wise comparison of all records infeasible. To tackle the second problem, many algorithms have been suggested that partition the data set and compare all record pairs only within each partition. One well-known such approach is the Sorted Neighborhood Method (SNM), which sorts the data according to some key and then advances a window over the data comparing only records that appear within the same window. We propose several variations of SNM that have in common a varying window size and advancement. The general intuition of such adaptive windows is that there might be regions of high similarity suggesting a larger window size and regions of lower similarity suggesting a smaller window size. We propose and thoroughly evaluate several adaption strategies, some of which are provably better than the original SNM in terms of efficiency (same results with fewer comparisons).}, language = {en} } @book{AbedjanNaumann2011, author = {Abedjan, Ziawasch and Naumann, Felix}, title = {Advancing the discovery of unique column combinations}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-148-6}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564}, publisher = {Universit{\"a}t Potsdam}, pages = {25}, year = {2011}, abstract = {Unique column combinations of a relational database table are sets of columns that contain only unique values. Discovering such combinations is a fundamental research problem and has many different data management and knowledge discovery applications. Existing discovery algorithms are either brute force or have a high memory load and can thus be applied only to small datasets or samples. In this paper, the wellknown GORDIAN algorithm and "Apriori-based" algorithms are compared and analyzed for further optimization. We greatly improve the Apriori algorithms through efficient candidate generation and statistics-based pruning methods. A hybrid solution HCAGORDIAN combines the advantages of GORDIAN and our new algorithm HCA, and it significantly outperforms all previous work in many situations.}, language = {en} } @book{BauckmannAbedjanLeseretal.2012, author = {Bauckmann, Jana and Abedjan, Ziawasch and Leser, Ulf and M{\"u}ller, Heiko and Naumann, Felix}, title = {Covering or complete? : Discovering conditional inclusion dependencies}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-212-4}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-62089}, publisher = {Universit{\"a}t Potsdam}, pages = {34}, year = {2012}, abstract = {Data dependencies, or integrity constraints, are used to improve the quality of a database schema, to optimize queries, and to ensure consistency in a database. In the last years conditional dependencies have been introduced to analyze and improve data quality. In short, a conditional dependency is a dependency with a limited scope defined by conditions over one or more attributes. Only the matching part of the instance must adhere to the dependency. In this paper we focus on conditional inclusion dependencies (CINDs). We generalize the definition of CINDs, distinguishing covering and completeness conditions. We present a new use case for such CINDs showing their value for solving complex data quality tasks. Further, we define quality measures for conditions inspired by precision and recall. We propose efficient algorithms that identify covering and completeness conditions conforming to given quality thresholds. Our algorithms choose not only the condition values but also the condition attributes automatically. Finally, we show that our approach efficiently provides meaningful and helpful results for our use case.}, language = {en} } @article{KossmannPapenbrockNaumann2021, author = {Koßmann, Jan and Papenbrock, Thorsten and Naumann, Felix}, title = {Data dependencies for query optimization}, series = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, volume = {31}, journal = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, number = {1}, publisher = {Springer}, address = {Berlin ; Heidelberg ; New York}, issn = {1066-8888}, doi = {10.1007/s00778-021-00676-3}, pages = {1 -- 22}, year = {2021}, abstract = {Effective query optimization is a core feature of any database management system. While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies that target the optimization of relational database queries. The survey supports database vendors to identify optimization opportunities as well as DBMS researchers to find related work and open research questions.}, language = {en} } @article{HameedNaumann2020, author = {Hameed, Mazhar and Naumann, Felix}, title = {Data Preparation}, series = {SIGMOD record}, volume = {49}, journal = {SIGMOD record}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0163-5808}, doi = {10.1145/3444831.3444835}, pages = {18 -- 29}, year = {2020}, abstract = {Raw data are often messy: they follow different encodings, records are not well structured, values do not adhere to patterns, etc. Such data are in general not fit to be ingested by downstream applications, such as data analytics tools, or even by data management systems. The act of obtaining information from raw data relies on some data preparation process. Data preparation is integral to advanced data analysis and data management, not only for data science but for any data-driven applications. Existing data preparation tools are operational and useful, but there is still room for improvement and optimization. With increasing data volume and its messy nature, the demand for prepared data increases day by day.
To cater to this demand, companies and researchers are developing techniques and tools for data preparation. To better understand the available data preparation systems, we have conducted a survey to investigate (1) prominent data preparation tools, (2) distinctive tool features, (3) the need for preliminary data processing even for these tools and, (4) features and abilities that are still lacking. We conclude with an argument in support of automatic and intelligent data preparation beyond traditional and simplistic techniques.}, language = {en} } @article{KoumarelasJiangNaumann2020, author = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix}, title = {Data preparation for duplicate detection}, series = {Journal of data and information quality : (JDIQ)}, volume = {12}, journal = {Journal of data and information quality : (JDIQ)}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3377878}, pages = {24}, year = {2020}, abstract = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints to domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection up to 19\% in AUC-PR.}, language = {en} } @book{AbedjanGolabNaumannetal., author = {Abedjan, Ziawasch and Golab, Lukasz and Naumann, Felix and Papenbrock, Thorsten}, title = {Data Profiling}, series = {Synthesis lectures on data management, 52}, journal = {Synthesis lectures on data management, 52}, publisher = {Morgan \& Claypool Publishers}, address = {San Rafael}, isbn = {978-1-68173-446-0}, pages = {xviii, 136}, language = {en} } @article{VitaglianoJiangNaumann2021, author = {Vitagliano, Gerardo and Jiang, Lan and Naumann, Felix}, title = {Detecting layout templates in complex multiregion files}, series = {Proceedings of the VLDB Endowment}, volume = {15}, journal = {Proceedings of the VLDB Endowment}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3494124.3494145}, pages = {646 -- 658}, year = {2021}, abstract = {Spreadsheets are among the most commonly used file formats for data management, distribution, and analysis. Their widespread employment makes it easy to gather large collections of data, but their flexible canvas-based structure makes automated analysis difficult without heavy preparation. One of the common problems that practitioners face is the presence of multiple, independent regions in a single spreadsheet, possibly separated by repeated empty cells. We define such files as "multiregion" files. In collections of various spreadsheets, we can observe that some share the same layout. We present the Mondrian approach to automatically identify layout templates across multiple files and systematically extract the corresponding regions. Our approach is composed of three phases: first, each file is rendered as an image and inspected for elements that could form regions; then, using a clustering algorithm, the identified elements are grouped to form regions; finally, every file layout is represented as a graph and compared with others to find layout templates. We compare our method to state-of-the-art table recognition algorithms on two corpora of real-world enterprise spreadsheets. Our approach shows the best performances in detecting reliable region boundaries within each file and can correctly identify recurring layouts across files.}, language = {en} } @article{CaruccioDeufemiaNaumannetal.2021, author = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe}, title = {Discovering relaxed functional dependencies based on multi-attribute dominance}, series = {IEEE transactions on knowledge and data engineering}, volume = {33}, journal = {IEEE transactions on knowledge and data engineering}, number = {9}, publisher = {Institute of Electrical and Electronics Engineers}, address = {New York, NY}, issn = {1041-4347}, doi = {10.1109/TKDE.2020.2967722}, pages = {3212 -- 3228}, year = {2021}, abstract = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds. An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.}, language = {en} } @book{BauckmannLeserNaumann2010, author = {Bauckmann, Jana and Leser, Ulf and Naumann, Felix}, title = {Efficient and exact computation of inclusion dependencies for data integration}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-048-9}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-41396}, publisher = {Universit{\"a}t Potsdam}, pages = {36}, year = {2010}, abstract = {Data obtained from foreign data sources often come with only superficial structural information, such as relation names and attribute names. Other types of metadata that are important for effective integration and meaningful querying of such data sets are missing. In particular, relationships among attributes, such as foreign keys, are crucial metadata for understanding the structure of an unknown database. The discovery of such relationships is difficult, because in principle for each pair of attributes in the database each pair of data values must be compared. A precondition for a foreign key is an inclusion dependency (IND) between the key and the foreign key attributes. We present with Spider an algorithm that efficiently finds all INDs in a given relational database. It leverages the sorting facilities of DBMS but performs the actual comparisons outside of the database to save computation. Spider analyzes very large databases up to an order of magnitude faster than previous approaches. We also evaluate in detail the effectiveness of several heuristics to reduce the number of necessary comparisons. Furthermore, we generalize Spider to find composite INDs covering multiple attributes, and partial INDs, which are true INDs for all but a certain number of values. This last type is particularly relevant when integrating dirty data as is often the case in the life sciences domain - our driving motivation.}, language = {en} }