@article{DraisbachChristenNaumann2019,
  author = {Draisbach, Uwe and Christen, Peter and Naumann, Felix},
  title = {Transforming pairwise duplicates to entity clusters for high-quality duplicate detection},
  journal = {ACM Journal of Data and Information Quality},
  volume = {12},
  number = {1},
  publisher = {Association for Computing Machinery},
  address = {New York},
  issn = {1936-1955},
  doi = {10.1145/3352591},
  pages = {1--30},
  year = {2019},
  abstract = {Duplicate detection algorithms produce clusters of database records, each cluster representing a single real-world entity. As most of these algorithms use pairwise comparisons, the resulting (transitive) clusters can be inconsistent: not all records within a cluster are sufficiently similar to be classified as duplicates. Thus, one of many subsequent clustering algorithms can further improve the result.
We explain in detail, compare, and evaluate many of these algorithms and introduce three new clustering algorithms in the specific context of duplicate detection. Two of our three new algorithms use the structure of the input graph to create consistent clusters. Our third algorithm, like many other clustering algorithms, focuses on the edge weights instead. For evaluation, in contrast to related work, we experiment on true real-world datasets and, in addition, examine in great detail various pair-selection strategies used in practice. While no overall winner emerges, we are able to identify the best approaches for different situations. In scenarios with larger clusters, our proposed algorithm, Extended Maximum Clique Clustering (EMCC), and Markov Clustering show the best results. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.},
  language = {en}
}

@article{SchirmerPapenbrockKoumarelasetal.2020,
  author = {Schirmer, Philipp and Papenbrock, Thorsten and Koumarelas, Ioannis and Naumann, Felix},
  title = {Efficient discovery of matching dependencies},
  journal = {ACM Transactions on Database Systems (TODS)},
  volume = {45},
  number = {3},
  publisher = {Association for Computing Machinery},
  address = {New York},
  issn = {0362-5915},
  doi = {10.1145/3392778},
  pages = {33},
  year = {2020},
  abstract = {Matching dependencies (MDs) are data profiling results that are often used for data integration, data cleaning, and entity matching. They are a generalization of functional dependencies (FDs), matching similar rather than identical elements. As their discovery is very difficult, existing profiling algorithms find either only small subsets of all MDs or their scope is limited to only small datasets. We focus on the efficient discovery of all interesting MDs in real-world datasets. For this purpose, we propose HyMD, a novel MD discovery algorithm that finds all minimal, non-trivial MDs within given similarity boundaries. The algorithm extracts the exact similarity thresholds for the individual MDs from the data instead of using predefined similarity thresholds. For this reason, it is the first approach to solve the MD discovery problem in an exact and truly complete way. If needed, the algorithm can, however, enforce certain properties on the reported MDs, such as disjointness and minimum support, to focus the discovery on those results that are actually required by downstream use cases. HyMD is technically a hybrid approach that combines the two most popular dependency discovery strategies in related work: lattice traversal and inference from record pairs.
Despite the additional effort of finding exact similarity thresholds for all MD candidates, the algorithm is still able to efficiently process large datasets, e.g., datasets larger than 3 GB.},
  language = {en}
}

@phdthesis{Draisbach2022,
  author = {Draisbach, Uwe},
  title = {Efficient duplicate detection and the impact of transitivity},
  doi = {10.25932/publishup-57214},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140},
  school = {Universit{\"a}t Potsdam},
  pages = {x, 150},
  year = {2022},
  abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and the social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but has led to a renaissance in duplicate detection research on the other hand. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. Transitivity implies that if record pairs $\langle r_i, r_j \rangle$ and $\langle r_j, r_k \rangle$ are classified as duplicates, then record pair $\langle r_i, r_k \rangle$ also has to be a duplicate. However, this reasoning might contradict the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the selection of the threshold that classifies a record pair as duplicate or non-duplicate. Our experiments show that, independently of the pair selection algorithm and the similarity measure used, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, earlier selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, with the Duplicate Count Strategy (DCS) and its enhancement DCS++, we present two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNM's window size based on the number of detected duplicates, and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure of the pairwise classifications is used to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially regarding the precision of results.
In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best-performing clustering approach for duplicate detection, although its runtime is longer than that of Markov Clustering due to its subexponential time complexity. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.},
  language = {en}
}

@phdthesis{Loster2021,
  author = {Loster, Michael},
  title = {Knowledge base construction with machine learning methods},
  doi = {10.25932/publishup-50145},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-501459},
  school = {Universit{\"a}t Potsdam},
  pages = {ii, 130},
  year = {2021},
  abstract = {Modern knowledge bases contain and organize knowledge from many different topic areas. Apart from specific entity information, they also store information about the relationships among these entities. Combining this information results in a knowledge graph that can be particularly helpful in cases where relationships are of central importance. Among other applications, modern risk assessment in the financial sector can benefit from the inherent network structure of such knowledge graphs by assessing the consequences and risks of certain events, such as corporate insolvencies or fraudulent behavior, based on the underlying network structure. As public knowledge bases often do not contain the necessary information for the analysis of such scenarios, the need arises to create and maintain dedicated domain-specific knowledge bases. This thesis investigates the process of creating domain-specific knowledge bases from structured and unstructured data sources. In particular, it addresses the topics of named entity recognition (NER), duplicate detection, and knowledge validation, which represent essential steps in the construction of knowledge bases. To this end, we present a novel method for duplicate detection based on a Siamese neural network that is able to learn a dataset-specific similarity measure, which is used to identify duplicates. Using the specialized network architecture, we design and implement a knowledge transfer between two deduplication networks, which leads to significant performance improvements and a reduction of the required training data. Furthermore, we propose a named entity recognition approach that is able to identify company names by integrating external knowledge in the form of dictionaries into the training process of a conditional random field classifier. In this context, we study the effects of different dictionaries on the performance of the NER classifier. We show that both the inclusion of domain knowledge and the generation and use of alias names result in significant performance improvements. For the validation of knowledge represented in a knowledge base, we introduce Colt, a framework for knowledge validation based on the interactive quality assessment of logical rules. In its most expressive implementation, we combine Gaussian processes with neural networks to create Colt-GP, an interactive algorithm for learning rule models. Unlike other approaches, Colt-GP uses knowledge graph embeddings and user feedback to cope with data quality issues of knowledge bases. The learned rule model can be used to conditionally apply a rule and assess its quality. Finally, we present CurEx, a prototypical system for building domain-specific knowledge bases from structured and unstructured data sources.
Its modular design is based on scalable technologies, which, in addition to enabling the processing of large datasets, ensure that the modules can be easily exchanged or extended. CurEx offers multiple user interfaces, each tailored to the individual needs of a specific user group, and is fully compatible with the Colt framework, which can be used as part of the system. We conduct a wide range of experiments with different datasets to determine the strengths and weaknesses of the proposed methods. To ensure the validity of our results, we compare the proposed methods with competing approaches.},
  language = {en}
}

@phdthesis{Koumarelas2020,
  author = {Koumarelas, Ioannis},
  title = {Data preparation and domain-agnostic duplicate detection},
  doi = {10.25932/publishup-48913},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-489131},
  school = {Universit{\"a}t Potsdam},
  pages = {x, 97},
  year = {2020},
  abstract = {Successfully completing any data science project demands careful consideration across its whole process. Although the focus is often put on later phases of the process, in practice experts spend more time in earlier phases, preparing data to make them consistent with the systems' requirements or to improve their models' accuracies. Duplicate detection is typically applied during the data cleaning phase, which is dedicated to removing data inconsistencies and improving the overall quality and usability of data. While data cleaning involves a plethora of approaches to perform specific operations, such as schema alignment and data normalization, the task of detecting and removing duplicate records is particularly challenging. Duplicates arise when multiple records representing the same entities exist in a database, for numerous reasons ranging from simple typographical errors to different schemas and formats of integrated databases. Keeping a database free of duplicates is crucial for most use cases, as their existence causes false negatives and false positives when matching queries against it. These two data quality issues have negative implications for tasks such as hotel booking, where users may erroneously select the wrong hotel, or parcel delivery, where a parcel can get delivered to the wrong address. Identifying the variety of possible data issues in order to eliminate duplicates demands sophisticated approaches. While research in duplicate detection is well established and covers different aspects of both efficiency and effectiveness, our work in this thesis focuses on the latter. We propose novel approaches to improve data quality before duplicate detection takes place and to apply duplicate detection to datasets even when prior labeling is not available. Our experiments show that improving data quality upfront can increase duplicate classification results by up to 19\%. To this end, we propose two novel pipelines that select and apply generic as well as address-specific data preparation steps with the purpose of maximizing the success of duplicate detection. Generic data preparation, such as the removal of special characters, can be applied to any relation with alphanumeric attributes. When applied, data preparation steps are selected only for attributes where they have positive effects on pair similarities, which indirectly affect classification, or on classification directly.
Our work on addresses is twofold: first, we consider more domain-specific approaches to improve the quality of values, and, second, we experiment with known and modified versions of similarity measures to select the most appropriate one per address attribute, e.g., city or country. To facilitate duplicate detection in applications where gold standard annotations are not available and obtaining them is not possible or too expensive, we propose MDedup. MDedup is a novel, rule-based, and fully automatic duplicate detection approach that is based on matching dependencies. These dependencies can be used to detect duplicates and can be discovered efficiently with state-of-the-art algorithms and without any prior labeling. MDedup uses two pipelines: it first trains on datasets with known labels, learning to identify useful matching dependencies, and can then be applied to unseen datasets, regardless of any existing gold standard. Finally, our work is accompanied by open-source code to enable the repeatability of our research results and the application of our approaches to other datasets.},
  language = {en}
}
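
Illustrative note (placed after the entries; BibTeX parsers ignore text that is not part of an entry): several of the abstracts above describe building entity clusters by taking the transitive closure of pairwise duplicate classifications. The following minimal Python sketch, using made-up record identifiers and pairs, shows that step as a union-find over the duplicate-pair graph; it is a generic illustration, not code from any of the cited works.

from collections import defaultdict

def transitive_closure(records, duplicate_pairs):
    """Cluster records as the connected components of the duplicate-pair graph."""
    parent = {r: r for r in records}

    def find(x):
        # follow parent pointers to the representative, with path halving
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x, y):
        parent[find(x)] = find(y)

    for a, b in duplicate_pairs:
        union(a, b)

    clusters = defaultdict(set)
    for r in records:
        clusters[find(r)].add(r)
    return list(clusters.values())

if __name__ == "__main__":
    # Toy example: the pairwise classifier accepted (r1, r2) and (r2, r3),
    # but never classified (r1, r3) as a duplicate.
    records = ["r1", "r2", "r3", "r4"]
    duplicate_pairs = [("r1", "r2"), ("r2", "r3")]
    for cluster in transitive_closure(records, duplicate_pairs):
        print(sorted(cluster))
    # Output: ['r1', 'r2', 'r3'] and ['r4'] -- r1 and r3 land in the same
    # cluster although their pair was never accepted, which is exactly the
    # inconsistency that the clustering algorithms discussed in the entries
    # above (e.g., EMCC, Markov Clustering) aim to resolve.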