@article{BleifussBornemannJohnsonetal.2018, author = {Bleifuss, Tobias and Bornemann, Leon and Johnson, Theodore and Kalashnikov, Dmitri and Naumann, Felix and Srivastava, Divesh}, title = {Exploring Change}, series = {Proceedings of the VLDB Endowment}, volume = {12}, journal = {Proceedings of the VLDB Endowment}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3282495.3282496}, pages = {85 -- 98}, year = {2018}, abstract = {Data and metadata in datasets experience many different kinds of change. Values axe inserted, deleted or updated; rows appear and disappear; columns are added or repurposed, etc. In such a dynamic situation, users might have many questions related to changes in the dataset, for instance which parts of the data are trustworthy and which are not? Users will wonder: How many changes have there been in the recent minutes, days or years? What kind of changes were made at which points of time? How dirty is the data? Is data cleansing required? The fact that data changed can hint at different hidden processes or agendas: a frequently crowd-updated city name may be controversial; a person whose name has been recently changed may be the target of vandalism; and so on. We show various use cases that benefit from recognizing and exploring such change. We envision a system and methods to interactively explore such change, addressing the variability dimension of big data challenges. To this end, we propose a model to capture change and the process of exploring dynamic data to identify salient changes. We provide exploration primitives along with motivational examples and measures for the volatility of data. We identify technical challenges that need to be addressed to make our vision a reality, and propose directions of future work for the data management community.}, language = {en} } @article{BertiEquilleHarmouchNaumannetal.2018, author = {Berti-Equille, Laure and Harmouch, Nazar and Naumann, Felix and Novelli, Noel and Saravanan, Thirumuruganathan}, title = {Discovery of genuine functional dependencies from relational data with missing values}, series = {Proceedings of the VLDB Endowment}, volume = {11}, journal = {Proceedings of the VLDB Endowment}, number = {8}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3204028.3204032}, pages = {880 -- 892}, year = {2018}, abstract = {Functional dependencies (FDs) play an important role in maintaining data quality. They can be used to enforce data consistency and to guide repairs over a database. In this work, we investigate the problem of missing values and its impact on FD discovery. When using existing FD discovery algorithms, some genuine FDs could not be detected precisely due to missing values or some non-genuine FDs can be discovered even though they are caused by missing values with a certain NULL semantics. We define a notion of genuineness and propose algorithms to compute the genuineness score of a discovered FD. This can be used to identify the genuine FDs among the set of all valid dependencies that hold on the data. We evaluate the quality of our method over various real-world and semi-synthetic datasets with extensive experiments. The results show that our method performs well for relatively large FD sets and is able to accurately capture genuine FDs.}, language = {en} } @article{KoumarelasPapenbrockNaumann2020, author = {Koumarelas, Ioannis and Papenbrock, Thorsten and Naumann, Felix}, title = {MDedup}, series = {Proceedings of the VLDB Endowment}, volume = {13}, journal = {Proceedings of the VLDB Endowment}, number = {5}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3377369.3377379}, pages = {712 -- 725}, year = {2020}, abstract = {Duplicate detection is an integral part of data cleaning and serves to identify multiple representations of same real-world entities in (relational) datasets. Existing duplicate detection approaches are effective, but they are also hard to parameterize or require a lot of pre-labeled training data. Both parameterization and pre-labeling are at least domain-specific if not dataset-specific, which is a problem if a new dataset needs to be cleaned. For this reason, we propose a novel, rule-based and fully automatic duplicate detection approach that is based on matching dependencies (MDs). Our system uses automatically discovered MDs, various dataset features, and known gold standards to train a model that selects MDs as duplicate detection rules. Once trained, the model can select useful MDs for duplicate detection on any new dataset. To increase the generally low recall of MD-based data cleaning approaches, we propose an additional boosting step. Our experiments show that this approach reaches up to 94\% F-measure and 100\% precision on our evaluation datasets, which are good numbers considering that the system does not require domain or target data-specific configuration.}, language = {en} } @article{KoumarelasJiangNaumann2020, author = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix}, title = {Data preparation for duplicate detection}, series = {Journal of data and information quality : (JDIQ)}, volume = {12}, journal = {Journal of data and information quality : (JDIQ)}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3377878}, pages = {24}, year = {2020}, abstract = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints to domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection up to 19\% in AUC-PR.}, language = {en} } @book{DraisbachNaumannSzottetal.2012, author = {Draisbach, Uwe and Naumann, Felix and Szott, Sascha and Wonneberg, Oliver}, title = {Adaptive windows for duplicate detection}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-143-1}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53007}, publisher = {Universit{\"a}t Potsdam}, pages = {41}, year = {2012}, abstract = {Duplicate detection is the task of identifying all groups of records within a data set that represent the same real-world entity, respectively. This task is difficult, because (i) representations might differ slightly, so some similarity measure must be defined to compare pairs of records and (ii) data sets might have a high volume making a pair-wise comparison of all records infeasible. To tackle the second problem, many algorithms have been suggested that partition the data set and compare all record pairs only within each partition. One well-known such approach is the Sorted Neighborhood Method (SNM), which sorts the data according to some key and then advances a window over the data comparing only records that appear within the same window. We propose several variations of SNM that have in common a varying window size and advancement. The general intuition of such adaptive windows is that there might be regions of high similarity suggesting a larger window size and regions of lower similarity suggesting a smaller window size. We propose and thoroughly evaluate several adaption strategies, some of which are provably better than the original SNM in terms of efficiency (same results with fewer comparisons).}, language = {en} } @article{KoumarelasKroschkMosleyetal.2018, author = {Koumarelas, Ioannis and Kroschk, Axel and Mosley, Clifford and Naumann, Felix}, title = {Experience: Enhancing address matching with geocoding and similarity measure selection}, series = {Journal of Data and Information Quality}, volume = {10}, journal = {Journal of Data and Information Quality}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3232852}, pages = {1 -- 16}, year = {2018}, abstract = {Given a query record, record matching is the problem of finding database records that represent the same real-world object. In the easiest scenario, a database record is completely identical to the query. However, in most cases, problems do arise, for instance, as a result of data errors or data integrated from multiple sources or received from restrictive form fields. These problems are usually difficult, because they require a variety of actions, including field segmentation, decoding of values, and similarity comparisons, each requiring some domain knowledge. In this article, we study the problem of matching records that contain address information, including attributes such as Street-address and City. To facilitate this matching process, we propose a domain-specific procedure to, first, enrich each record with a more complete representation of the address information through geocoding and reverse-geocoding and, second, to select the best similarity measure per each address attribute that will finally help the classifier to achieve the best f-measure. We report on our experience in selecting geocoding services and discovering similarity measures for a concrete but common industry use-case.}, language = {en} } @misc{LosterNaumannEhmuelleretal.2018, author = {Loster, Michael and Naumann, Felix and Ehmueller, Jan and Feldmann, Benjamin}, title = {CurEx}, series = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, journal = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, publisher = {Association for Computing Machinery}, address = {New York}, isbn = {978-1-4503-6014-2}, doi = {10.1145/3269206.3269229}, pages = {1883 -- 1886}, year = {2018}, abstract = {The integration of diverse structured and unstructured information sources into a unified, domain-specific knowledge base is an important task in many areas. A well-maintained knowledge base enables data analysis in complex scenarios, such as risk analysis in the financial sector or investigating large data leaks, such as the Paradise or Panama papers. Both the creation of such knowledge bases, as well as their continuous maintenance and curation involves many complex tasks and considerable manual effort. With CurEx, we present a modular system that allows structured and unstructured data sources to be integrated into a domain-specific knowledge base. In particular, we (i) enable the incremental improvement of each individual integration component; (ii) enable the selective generation of multiple knowledge graphs from the information contained in the knowledge base; and (iii) provide two distinct user interfaces tailored to the needs of data engineers and end-users respectively. The former has curation capabilities and controls the integration process, whereas the latter focuses on the exploration of the generated knowledge graph.}, language = {en} } @article{HameedNaumann2020, author = {Hameed, Mazhar and Naumann, Felix}, title = {Data Preparation}, series = {SIGMOD record}, volume = {49}, journal = {SIGMOD record}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0163-5808}, doi = {10.1145/3444831.3444835}, pages = {18 -- 29}, year = {2020}, abstract = {Raw data are often messy: they follow different encodings, records are not well structured, values do not adhere to patterns, etc. Such data are in general not fit to be ingested by downstream applications, such as data analytics tools, or even by data management systems. The act of obtaining information from raw data relies on some data preparation process. Data preparation is integral to advanced data analysis and data management, not only for data science but for any data-driven applications. Existing data preparation tools are operational and useful, but there is still room for improvement and optimization. With increasing data volume and its messy nature, the demand for prepared data increases day by day.
To cater to this demand, companies and researchers are developing techniques and tools for data preparation. To better understand the available data preparation systems, we have conducted a survey to investigate (1) prominent data preparation tools, (2) distinctive tool features, (3) the need for preliminary data processing even for these tools and, (4) features and abilities that are still lacking. We conclude with an argument in support of automatic and intelligent data preparation beyond traditional and simplistic techniques.}, language = {en} } @article{JiangNaumann2020, author = {Jiang, Lan and Naumann, Felix}, title = {Holistic primary key and foreign key detection}, series = {Journal of intelligent information systems : JIIS}, volume = {54}, journal = {Journal of intelligent information systems : JIIS}, number = {3}, publisher = {Springer}, address = {Dordrecht}, issn = {0925-9902}, doi = {10.1007/s10844-019-00562-z}, pages = {439 -- 461}, year = {2020}, abstract = {Primary keys (PKs) and foreign keys (FKs) are important elements of relational schemata in various applications, such as query optimization and data integration. However, in many cases, these constraints are unknown or not documented. Detecting them manually is time-consuming and even infeasible in large-scale datasets. We study the problem of discovering primary keys and foreign keys automatically and propose an algorithm to detect both, namely Holistic Primary Key and Foreign Key Detection (HoPF). PKs and FKs are subsets of the sets of unique column combinations (UCCs) and inclusion dependencies (INDs), respectively, for which efficient discovery algorithms are known. Using score functions, our approach is able to effectively extract the true PKs and FKs from the vast sets of valid UCCs and INDs. Several pruning rules are employed to speed up the procedure. We evaluate precision and recall on three benchmarks and two real-world datasets. The results show that our method is able to retrieve on average 88\% of all primary keys, and 91\% of all foreign keys. We compare the performance of HoPF with two baseline approaches that both assume the existence of primary keys.}, language = {en} } @article{SchirmerPapenbrockKoumarelasetal.2020, author = {Schirmer, Philipp and Papenbrock, Thorsten and Koumarelas, Ioannis and Naumann, Felix}, title = {Efficient discovery of matching dependencies}, series = {ACM transactions on database systems : TODS}, volume = {45}, journal = {ACM transactions on database systems : TODS}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0362-5915}, doi = {10.1145/3392778}, pages = {33}, year = {2020}, abstract = {Matching dependencies (MDs) are data profiling results that are often used for data integration, data cleaning, and entity matching. They are a generalization of functional dependencies (FDs) matching similar rather than same elements. As their discovery is very difficult, existing profiling algorithms find either only small subsets of all MDs or their scope is limited to only small datasets. We focus on the efficient discovery of all interesting MDs in real-world datasets. For this purpose, we propose HyMD, a novel MD discovery algorithm that finds all minimal, non-trivial MDs within given similarity boundaries. The algorithm extracts the exact similarity thresholds for the individual MDs from the data instead of using predefined similarity thresholds. For this reason, it is the first approach to solve the MD discovery problem in an exact and truly complete way. If needed, the algorithm can, however, enforce certain properties on the reported MDs, such as disjointness and minimum support, to focus the discovery on such results that are actually required by downstream use cases. HyMD is technically a hybrid approach that combines the two most popular dependency discovery strategies in related work: lattice traversal and inference from record pairs. Despite the additional effort of finding exact similarity thresholds for all MD candidates, the algorithm is still able to efficiently process large datasets, e.g., datasets larger than 3 GB.}, language = {en} }