@article{KoumarelasJiangNaumann2020,
  author    = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix},
  title     = {Data preparation for duplicate detection},
  series    = {Journal of Data and Information Quality (JDIQ)},
  volume    = {12},
  journal   = {Journal of Data and Information Quality (JDIQ)},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3377878},
  pages     = {24},
  year      = {2020},
  abstract  = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in the data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations in a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: it begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints for domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations, an iterative leave-one-out classification process removes preparations one by one and determines the redundant ones based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection by up to 19\% in AUC-PR.},
  language  = {en}
}
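The selection workflow sketched in the abstract above prunes data preparations with an iterative leave-one-out loop scored by AUC-PR. Below is a minimal Python sketch of that pruning idea only, not the authors' implementation; the name prune_preparations and the auc_pr callback (assumed to re-run duplicate detection on the gold-standard sample and return the area under the precision-recall curve) are hypothetical.

    # Minimal sketch of leave-one-out pruning of data preparations,
    # assuming an external auc_pr(preparations) evaluation callback.
    from typing import Callable, List, Set

    def prune_preparations(
        candidates: List[str],
        auc_pr: Callable[[Set[str]], float],
        tolerance: float = 0.0,
    ) -> Set[str]:
        """Drop preparations whose removal does not lower AUC-PR."""
        selected = set(candidates)
        changed = True
        while changed:
            changed = False
            for prep in sorted(selected):
                baseline = auc_pr(selected)
                without = auc_pr(selected - {prep})
                # A preparation is treated as redundant if leaving it
                # out keeps AUC-PR at least as high as the baseline.
                if without + tolerance >= baseline:
                    selected.discard(prep)
                    changed = True
        return selected

A caller would pass the preparation names that survived the first, similarity-based filtering phase together with an evaluation function for the prepared sample.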
@article{KoumarelasKroschkMosleyetal.2018,
  author    = {Koumarelas, Ioannis and Kroschk, Axel and Mosley, Clifford and Naumann, Felix},
  title     = {Experience: Enhancing address matching with geocoding and similarity measure selection},
  series    = {Journal of Data and Information Quality (JDIQ)},
  volume    = {10},
  journal   = {Journal of Data and Information Quality (JDIQ)},
  number    = {2},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3232852},
  pages     = {1--16},
  year      = {2018},
  abstract  = {Given a query record, record matching is the problem of finding database records that represent the same real-world object. In the easiest scenario, a database record is completely identical to the query. However, in most cases problems do arise, for instance as a result of data errors, data integrated from multiple sources, or data received from restrictive form fields. These problems are usually difficult because they require a variety of actions, including field segmentation, decoding of values, and similarity comparisons, each requiring some domain knowledge. In this article, we study the problem of matching records that contain address information, including attributes such as Street-address and City. To facilitate this matching process, we propose a domain-specific procedure that, first, enriches each record with a more complete representation of the address information through geocoding and reverse geocoding and, second, selects the best similarity measure for each address attribute, which finally helps the classifier achieve the best F-measure. We report on our experience in selecting geocoding services and discovering similarity measures for a concrete but common industry use case.},
  language  = {en}
}
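To illustrate the per-attribute similarity measure selection described above, here is a small Python sketch that compares two standard-library measures on labeled value pairs of a single address attribute and keeps the one with the highest F-measure. The measure set, the fixed decision threshold of 0.8, and the helper names are illustrative assumptions, not the procedure from the paper.

    # Hypothetical helper: pick the similarity measure that maximizes
    # F-measure for one address attribute, given labeled value pairs.
    from difflib import SequenceMatcher
    from typing import Callable, Dict, List, Tuple

    def jaccard(a: str, b: str) -> float:
        ta, tb = set(a.lower().split()), set(b.lower().split())
        return len(ta & tb) / len(ta | tb) if ta | tb else 1.0

    def edit_ratio(a: str, b: str) -> float:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    MEASURES: Dict[str, Callable[[str, str], float]] = {
        "jaccard": jaccard,
        "edit_ratio": edit_ratio,
    }

    def best_measure(pairs: List[Tuple[str, str, bool]],
                     threshold: float = 0.8) -> str:
        """pairs holds (value_a, value_b, is_match) for one attribute."""
        best_name, best_f1 = "", -1.0
        for name, sim in MEASURES.items():
            tp = fp = fn = 0
            for a, b, is_match in pairs:
                predicted = sim(a, b) >= threshold
                tp += predicted and is_match
                fp += predicted and not is_match
                fn += (not predicted) and is_match
            precision = tp / (tp + fp) if tp + fp else 0.0
            recall = tp / (tp + fn) if tp + fn else 0.0
            f1 = (2 * precision * recall / (precision + recall)
                  if precision + recall else 0.0)
            if f1 > best_f1:
                best_name, best_f1 = name, f1
        return best_name

In the same spirit, the attributes produced by the geocoding enrichment step would simply be further columns for which a measure is selected.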
@article{LosterKoumarelasNaumann2021,
  author    = {Loster, Michael and Koumarelas, Ioannis and Naumann, Felix},
  title     = {Knowledge transfer for entity resolution with Siamese neural networks},
  series    = {Journal of Data and Information Quality (JDIQ)},
  volume    = {13},
  journal   = {Journal of Data and Information Quality (JDIQ)},
  number    = {1},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3410157},
  pages     = {25},
  year      = {2021},
  abstract  = {The integration of multiple data sources is a common problem in a large variety of applications. Traditionally, handcrafted similarity measures are used to discover, merge, and integrate multiple representations of the same entity (duplicates) into a large homogeneous collection of data. Often, these similarity measures do not cope well with the heterogeneity of the underlying dataset. In addition, domain experts are needed to manually design and configure such measures, a process that is time-consuming and requires extensive domain expertise.
We propose a deep Siamese neural network capable of learning a similarity measure that is tailored to the characteristics of a particular dataset. Owing to the properties of deep learning methods, we are able to eliminate the manual feature engineering process and thus considerably reduce the effort required for model construction. In addition, we show that it is possible to transfer knowledge acquired during the deduplication of one dataset to another, and thus significantly reduce the amount of data required to train a similarity measure. We evaluate our method on multiple datasets and compare our approach to state-of-the-art deduplication methods. Our approach outperforms competitors by up to +26 percent in F-measure, depending on task and dataset. In addition, we show that knowledge transfer is not only feasible but, in our experiments, led to an improvement in F-measure of up to +4.7 percent.},
  language  = {en}
}
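As a rough illustration of the Siamese architecture described in the abstract above, the following PyTorch sketch shares a single encoder between both input records and classifies a pair from the absolute difference of the two embeddings. Layer sizes, the 300-dimensional input representation, and the transfer note in the comments are assumptions, not the configuration reported in the article.

    # Minimal Siamese matcher sketch (PyTorch); illustrative sizes only.
    import torch
    import torch.nn as nn

    class SiameseMatcher(nn.Module):
        def __init__(self, input_dim: int, hidden_dim: int = 128):
            super().__init__()
            # One encoder, shared by both inputs ("Siamese" weights).
            self.encoder = nn.Sequential(
                nn.Linear(input_dim, hidden_dim), nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            )
            # Classification head on the element-wise absolute difference.
            self.head = nn.Sequential(
                nn.Linear(hidden_dim, 64), nn.ReLU(),
                nn.Linear(64, 1),
            )

        def forward(self, left: torch.Tensor, right: torch.Tensor) -> torch.Tensor:
            emb_left = self.encoder(left)
            emb_right = self.encoder(right)
            return self.head(torch.abs(emb_left - emb_right)).squeeze(-1)

    # Knowledge transfer, roughly: train on a source dataset, then reuse
    # the encoder (frozen or fine-tuned) on a smaller target dataset.
    model = SiameseMatcher(input_dim=300)
    loss_fn = nn.BCEWithLogitsLoss()
    logits = model(torch.randn(4, 300), torch.randn(4, 300))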