% Journal article, PVLDB 16(8): the Pollock benchmark for loading non-standard CSV files.
@article{VitaglianoHameedJiangetal.2023,
  author    = {Vitagliano, Gerardo and Hameed, Mazhar and Jiang, Lan and Reisener, Lucas and Wu, Eugene and Naumann, Felix},
  title     = {{Pollock}: a data loading benchmark},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {16},
  number    = {8},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3594512.3594518},
  pages     = {1870--1882},
  year      = {2023},
  abstract  = {Any system at play in a data-driven project has a fundamental requirement: the ability to load data. The de-facto standard format to distribute and consume raw data is CSV. Yet, the plain text and flexible nature of this format make such files often difficult to parse and correctly load their content, requiring cumbersome data preparation steps. We propose a benchmark to assess the robustness of systems in loading data from non-standard CSV formats and with structural inconsistencies. First, we formalize a model to describe the issues that affect real-world files and use it to derive a systematic ``pollution'' process to generate dialects for any given grammar. Our benchmark leverages the pollution framework for the csv format. To guide pollution, we have surveyed thousands of real-world, publicly available csv files, recording the problems we encountered. We demonstrate the applicability of our benchmark by testing and scoring 16 different systems: popular csv parsing frameworks, relational database tools, spreadsheet systems, and a data visualization tool.},
  language  = {en}
}
% Journal article, SIGMOD Record 50(4): gender diversity analysis of database publications.
@article{BonifatiMiorNaumannetal.2022,
  author    = {Bonifati, Angela and Mior, Michael J. and Naumann, Felix and Noack, Nele Sina},
  title     = {How inclusive are we?},
  journal   = {SIGMOD Record},
  volume    = {50},
  number    = {4},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0163-5808},
  doi       = {10.1145/3516431.3516438},
  pages     = {30--35},
  year      = {2022},
  abstract  = {ACM SIGMOD, VLDB and other database organizations have committed to fostering an inclusive and diverse community, as do many other scientific organizations. Recently, different measures have been taken to advance these goals, especially for underrepresented groups. One possible measure is double-blind reviewing, which aims to hide gender, ethnicity, and other properties of the authors. We report the preliminary results of a gender diversity analysis of publications of the database community across several peer-reviewed venues, and also compare women's authorship percentages in both single-blind and double-blind venues along the years. We also obtained a cross comparison of the obtained results in data management with other relevant areas in Computer Science.},
  language  = {en}
}
% Journal article, TKDE 33(9): the Domino algorithm for discovering relaxed functional dependencies.
@article{CaruccioDeufemiaNaumannetal.2021,
  author    = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe},
  title     = {Discovering relaxed functional dependencies based on multi-attribute dominance},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  volume    = {33},
  number    = {9},
  publisher = {Institute of Electrical and Electronics Engineers},
  address   = {New York, NY},
  issn      = {1041-4347},
  doi       = {10.1109/TKDE.2020.2967722},
  pages     = {3212--3228},
  year      = {2021},
  abstract  = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds. An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.},
  language  = {en}
}
% Journal article, The VLDB Journal 31(1): survey of query optimization enabled by data dependencies.
@article{KossmannPapenbrockNaumann2021,
  author    = {Ko{\ss}mann, Jan and Papenbrock, Thorsten and Naumann, Felix},
  title     = {Data dependencies for query optimization},
  journal   = {The VLDB Journal},
  volume    = {31},
  number    = {1},
  publisher = {Springer},
  address   = {Berlin ; Heidelberg ; New York},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-021-00676-3},
  pages     = {1--22},
  year      = {2021},
  abstract  = {Effective query optimization is a core feature of any database management system. While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies that target the optimization of relational database queries. The survey supports database vendors to identify optimization opportunities as well as DBMS researchers to find related work and open research questions.},
  language  = {en}
}
% Journal article, PVLDB 15(3): the Mondrian approach for layout templates in multiregion spreadsheets.
@article{VitaglianoJiangNaumann2021,
  author    = {Vitagliano, Gerardo and Jiang, Lan and Naumann, Felix},
  title     = {Detecting layout templates in complex multiregion files},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {15},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3494124.3494145},
  pages     = {646--658},
  year      = {2021},
  abstract  = {Spreadsheets are among the most commonly used file formats for data management, distribution, and analysis. Their widespread employment makes it easy to gather large collections of data, but their flexible canvas-based structure makes automated analysis difficult without heavy preparation. One of the common problems that practitioners face is the presence of multiple, independent regions in a single spreadsheet, possibly separated by repeated empty cells. We define such files as ``multiregion'' files. In collections of various spreadsheets, we can observe that some share the same layout. We present the Mondrian approach to automatically identify layout templates across multiple files and systematically extract the corresponding regions. Our approach is composed of three phases: first, each file is rendered as an image and inspected for elements that could form regions; then, using a clustering algorithm, the identified elements are grouped to form regions; finally, every file layout is represented as a graph and compared with others to find layout templates. We compare our method to state-of-the-art table recognition algorithms on two corpora of real-world enterprise spreadsheets. Our approach shows the best performances in detecting reliable region boundaries within each file and can correctly identify recurring layouts across files.},
  language  = {en}
}
% Journal article, JDIQ 13(1): Siamese networks and knowledge transfer for entity resolution.
% NOTE(review): pages holds a single number (25), presumably a page count rather than a range -- confirm.
@article{LosterKoumarelasNaumann2021,
  author    = {Loster, Michael and Koumarelas, Ioannis and Naumann, Felix},
  title     = {Knowledge transfer for entity resolution with {Siamese} neural networks},
  journal   = {ACM Journal of Data and Information Quality},
  volume    = {13},
  number    = {1},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3410157},
  pages     = {25},
  year      = {2021},
  abstract  = {The integration of multiple data sources is a common problem in a large variety of applications. Traditionally, handcrafted similarity measures are used to discover, merge, and integrate multiple representations of the same entity---duplicates---into a large homogeneous collection of data. Often, these similarity measures do not cope well with the heterogeneity of the underlying dataset. In addition, domain experts are needed to manually design and configure such measures, which is both time-consuming and requires extensive domain expertise. We propose a deep Siamese neural network, capable of learning a similarity measure that is tailored to the characteristics of a particular dataset. With the properties of deep learning methods, we are able to eliminate the manual feature engineering process and thus considerably reduce the effort required for model construction. In addition, we show that it is possible to transfer knowledge acquired during the deduplication of one dataset to another, and thus significantly reduce the amount of data required to train a similarity measure. We evaluated our method on multiple datasets and compare our approach to state-of-the-art deduplication methods. Our approach outperforms competitors by up to +26 percent F-measure, depending on task and dataset. In addition, we show that knowledge transfer is not only feasible, but in our experiments led to an improvement in F-measure of up to +4.7 percent.},
  language  = {en}
}
% Journal article, SIGMOD Record 50(4): report by the VLDB 2021 general and program committee chairs.
@article{BonnetDongNaumannetal.2021,
  author    = {Bonnet, Philippe and Dong, Xin Luna and Naumann, Felix and T{\"o}z{\"u}n, P{\i}nar},
  title     = {{VLDB} 2021},
  journal   = {SIGMOD Record},
  volume    = {50},
  number    = {4},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0163-5808},
  doi       = {10.1145/3516431.3516447},
  pages     = {50--53},
  year      = {2021},
  abstract  = {The 47th International Conference on Very Large Databases (VLDB'21) was held on August 16--20, 2021 as a hybrid conference. It attracted 180 in-person attendees in Copenhagen and 840 remote attendees. In this paper, we describe our key decisions as general chairs and program committee chairs and share the lessons we learned.},
  language  = {en}
}
% Book / technical report no. 138: proceedings of the HPI Research School 2020 Fall Retreat.
% The entry originally carried two publisher fields; BibTeX keeps only the first, so the
% second value is preserved under institution (presumably the hosting university -- confirm).
@book{MeinelDoellnerWeskeetal.2021,
  author    = {Meinel, Christoph and D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and Giese, Holger and Baudisch, Patrick and Friedrich, Tobias and B{\"o}ttinger, Erwin and Lippert, Christoph and D{\"o}rr, Christian and Lehmann, Anja and Renard, Bernhard and Rabl, Tilmann and Uebernickel, Falk and Arnrich, Bert and H{\"o}lzle, Katharina},
  title     = {Proceedings of the {HPI} Research School on Service-oriented Systems Engineering 2020 Fall Retreat},
  number    = {138},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  institution = {Universit{\"a}t Potsdam},
  isbn      = {978-3-86956-513-2},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-50413},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-504132},
  pages     = {vi, 144},
  year      = {2021},
  abstract  = {Design and Implementation of service-oriented architectures imposes a huge number of research questions from the fields of software engineering, system analysis and modeling, adaptability, and application integration. Component orientation and web services are two approaches for design and realization of complex web-based system. Both approaches allow for dynamic application adaptation as well as integration of enterprise application. Service-Oriented Systems Engineering represents a symbiosis of best practices in object-orientation, component-based development, distributed computing, and business process management. It provides integration of business and IT concerns. The annual Ph.D. Retreat of the Research School provides each member the opportunity to present his/her current state of their research and to give an outline of a prospective Ph.D. thesis. Due to the interdisciplinary structure of the research school, this technical report covers a wide range of topics. These include but are not limited to: Human Computer Interaction and Computer Vision as Service; Service-oriented Geovisualization Systems; Algorithm Engineering for Service-oriented Systems; Modeling and Verification of Self-adaptive Service-oriented Systems; Tools and Methods for Software Engineering in Service-oriented Systems; Security Engineering of Service-based IT Systems; Service-oriented Information Systems; Evolutionary Transition of Enterprise Applications to Service Orientation; Operating System Abstractions for Service-oriented Computing; and Services Specification, Composition, and Enactment.},
  language  = {en}
}
% Journal article, PVLDB 13(5): MDedup, duplicate detection based on matching dependencies.
@article{KoumarelasPapenbrockNaumann2020,
  author    = {Koumarelas, Ioannis and Papenbrock, Thorsten and Naumann, Felix},
  title     = {{MDedup}},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {13},
  number    = {5},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3377369.3377379},
  pages     = {712--725},
  year      = {2020},
  abstract  = {Duplicate detection is an integral part of data cleaning and serves to identify multiple representations of same real-world entities in (relational) datasets. Existing duplicate detection approaches are effective, but they are also hard to parameterize or require a lot of pre-labeled training data. Both parameterization and pre-labeling are at least domain-specific if not dataset-specific, which is a problem if a new dataset needs to be cleaned. For this reason, we propose a novel, rule-based and fully automatic duplicate detection approach that is based on matching dependencies (MDs). Our system uses automatically discovered MDs, various dataset features, and known gold standards to train a model that selects MDs as duplicate detection rules. Once trained, the model can select useful MDs for duplicate detection on any new dataset. To increase the generally low recall of MD-based data cleaning approaches, we propose an additional boosting step. Our experiments show that this approach reaches up to 94\% F-measure and 100\% precision on our evaluation datasets, which are good numbers considering that the system does not require domain or target data-specific configuration.},
  language  = {en}
}
% Journal article, JDIQ 12(3): selecting data preparation steps ahead of duplicate detection.
% NOTE(review): pages holds a single number (24), presumably a page count rather than a range -- confirm.
@article{KoumarelasJiangNaumann2020,
  author    = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix},
  title     = {Data preparation for duplicate detection},
  journal   = {ACM Journal of Data and Information Quality},
  volume    = {12},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3377878},
  pages     = {24},
  year      = {2020},
  abstract  = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection. Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints to domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection up to 19\% in AUC-PR.},
  language  = {en}
}
% Journal article, SIGMOD Record 49(3): survey of data preparation tools and their features.
@article{HameedNaumann2020,
  author    = {Hameed, Mazhar and Naumann, Felix},
  title     = {Data Preparation},
  journal   = {SIGMOD Record},
  volume    = {49},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0163-5808},
  doi       = {10.1145/3444831.3444835},
  pages     = {18--29},
  year      = {2020},
  abstract  = {Raw data are often messy: they follow different encodings, records are not well structured, values do not adhere to patterns, etc. Such data are in general not fit to be ingested by downstream applications, such as data analytics tools, or even by data management systems. The act of obtaining information from raw data relies on some data preparation process. Data preparation is integral to advanced data analysis and data management, not only for data science but for any data-driven applications. Existing data preparation tools are operational and useful, but there is still room for improvement and optimization. With increasing data volume and its messy nature, the demand for prepared data increases day by day. To cater to this demand, companies and researchers are developing techniques and tools for data preparation. To better understand the available data preparation systems, we have conducted a survey to investigate (1) prominent data preparation tools, (2) distinctive tool features, (3) the need for preliminary data processing even for these tools and, (4) features and abilities that are still lacking. We conclude with an argument in support of automatic and intelligent data preparation beyond traditional and simplistic techniques.},
  language  = {en}
}
% Journal article, JIIS 54(3): HoPF, joint detection of primary and foreign keys.
@article{JiangNaumann2020,
  author    = {Jiang, Lan and Naumann, Felix},
  title     = {Holistic primary key and foreign key detection},
  journal   = {Journal of Intelligent Information Systems},
  volume    = {54},
  number    = {3},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {0925-9902},
  doi       = {10.1007/s10844-019-00562-z},
  pages     = {439--461},
  year      = {2020},
  abstract  = {Primary keys (PKs) and foreign keys (FKs) are important elements of relational schemata in various applications, such as query optimization and data integration. However, in many cases, these constraints are unknown or not documented. Detecting them manually is time-consuming and even infeasible in large-scale datasets. We study the problem of discovering primary keys and foreign keys automatically and propose an algorithm to detect both, namely Holistic Primary Key and Foreign Key Detection (HoPF). PKs and FKs are subsets of the sets of unique column combinations (UCCs) and inclusion dependencies (INDs), respectively, for which efficient discovery algorithms are known. Using score functions, our approach is able to effectively extract the true PKs and FKs from the vast sets of valid UCCs and INDs. Several pruning rules are employed to speed up the procedure. We evaluate precision and recall on three benchmarks and two real-world datasets. The results show that our method is able to retrieve on average 88\% of all primary keys, and 91\% of all foreign keys. We compare the performance of HoPF with two baseline approaches that both assume the existence of primary keys.},
  language  = {en}
}
% Journal article, TODS 45(3): HyMD, exact discovery of matching dependencies.
% NOTE(review): pages holds a single number (33), presumably a page count rather than a range -- confirm.
@article{SchirmerPapenbrockKoumarelasetal.2020,
  author    = {Schirmer, Philipp and Papenbrock, Thorsten and Koumarelas, Ioannis and Naumann, Felix},
  title     = {Efficient discovery of matching dependencies},
  journal   = {ACM Transactions on Database Systems},
  volume    = {45},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0362-5915},
  doi       = {10.1145/3392778},
  pages     = {33},
  year      = {2020},
  abstract  = {Matching dependencies (MDs) are data profiling results that are often used for data integration, data cleaning, and entity matching. They are a generalization of functional dependencies (FDs) matching similar rather than same elements. As their discovery is very difficult, existing profiling algorithms find either only small subsets of all MDs or their scope is limited to only small datasets. We focus on the efficient discovery of all interesting MDs in real-world datasets. For this purpose, we propose HyMD, a novel MD discovery algorithm that finds all minimal, non-trivial MDs within given similarity boundaries. The algorithm extracts the exact similarity thresholds for the individual MDs from the data instead of using predefined similarity thresholds. For this reason, it is the first approach to solve the MD discovery problem in an exact and truly complete way. If needed, the algorithm can, however, enforce certain properties on the reported MDs, such as disjointness and minimum support, to focus the discovery on such results that are actually required by downstream use cases. HyMD is technically a hybrid approach that combines the two most popular dependency discovery strategies in related work: lattice traversal and inference from record pairs. Despite the additional effort of finding exact similarity thresholds for all MD candidates, the algorithm is still able to efficiently process large datasets, e.g., datasets larger than 3 GB.},
  language  = {en}
}
% Journal article, Artificial Intelligence and Law 28(4): legal incentives for explainable ML.
@article{HackerKrestelGrundmannetal.2020,
  author    = {Hacker, Philipp and Krestel, Ralf and Grundmann, Stefan and Naumann, Felix},
  title     = {Explainable {AI} under contract and tort law},
  journal   = {Artificial Intelligence and Law},
  volume    = {28},
  number    = {4},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {0924-8463},
  doi       = {10.1007/s10506-020-09260-6},
  pages     = {415--439},
  year      = {2020},
  abstract  = {This paper shows that the law, in subtle ways, may set hitherto unrecognized incentives for the adoption of explainable machine learning applications. In doing so, we make two novel contributions. First, on the legal side, we show that to avoid liability, professional actors, such as doctors and managers, may soon be legally compelled to use explainable ML models. We argue that the importance of explainability reaches far beyond data protection law, and crucially influences questions of contractual and tort liability for the use of ML models. To this effect, we conduct two legal case studies, in medical and corporate merger applications of ML. As a second contribution, we discuss the (legally required) trade-off between accuracy and explainability and demonstrate the effect in a technical case study in the context of spam classification.},
  language  = {en}
}
% Journal article, PVLDB 13(11): HPIValid, UCC discovery via hitting set enumeration.
@article{BirnickBlaesiusFriedrichetal.2020,
  author    = {Birnick, Johann and Bl{\"a}sius, Thomas and Friedrich, Tobias and Naumann, Felix and Papenbrock, Thorsten and Schirneck, Friedrich Martin},
  title     = {Hitting set enumeration with partial information for unique column combination discovery},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {13},
  number    = {11},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3407790.3407824},
  pages     = {2270--2283},
  year      = {2020},
  abstract  = {Unique column combinations (UCCs) are a fundamental concept in relational databases. They identify entities in the data and support various data management activities. Still, UCCs are usually not explicitly defined and need to be discovered. State-of-the-art data profiling algorithms are able to efficiently discover UCCs in moderately sized datasets, but they tend to fail on large and, in particular, on wide datasets due to run time and memory limitations. In this paper, we introduce HPIValid, a novel UCC discovery algorithm that implements a faster and more resource-saving search strategy. HPIValid models the metadata discovery as a hitting set enumeration problem in hypergraphs. In this way, it combines efficient discovery techniques from data profiling research with the most recent theoretical insights into enumeration algorithms. Our evaluation shows that HPIValid is not only orders of magnitude faster than related work, it also has a much smaller memory footprint.},
  language  = {en}
}
% Entry in the University of Potsdam secondary-publication series, no. 6; it appears to be
% a postprint of the identically titled article in The VLDB Journal 29(6) -- confirm.
@misc{KruseKaoudiContrerasRojasetal.2020,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  series    = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t},
  number    = {6},
  doi       = {10.25932/publishup-51944},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-519443},
  pages     = {26},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this paper, we present the cost-based optimizer of Rheem, an open-source cross-platform system that copes with these new requirements. The optimizer allocates the subtasks of data analytic tasks to the most suitable platforms. Our main contributions are: (i) a mechanism based on graph transformations to explore alternative execution strategies; (ii) a novel graph-based approach to determine efficient data movement plans among subtasks and platforms; and (iii) an efficient plan enumeration algorithm, based on a novel enumeration algebra. We extensively evaluate our optimizer under diverse real tasks. We show that our optimizer can perform tasks more than one order of magnitude faster when using multiple platforms than when using a single platform.},
  language  = {en}
}
% Journal article, The VLDB Journal 29(6): the cost-based cross-platform optimizer of Rheem.
% The key carries an "a" suffix because the unsuffixed key was already taken by an earlier
% entry in this file; BibTeX rejects a repeated key, which made this entry unreachable.
@article{KruseKaoudiContrerasRojasetal.2020a,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  journal   = {The VLDB Journal},
  volume    = {29},
  number    = {6},
  publisher = {Springer},
  address   = {Berlin},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-020-00612-x},
  pages     = {1287--1310},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this paper, we present the cost-based optimizer of Rheem, an open-source cross-platform system that copes with these new requirements. The optimizer allocates the subtasks of data analytic tasks to the most suitable platforms. Our main contributions are: (i) a mechanism based on graph transformations to explore alternative execution strategies; (ii) a novel graph-based approach to determine efficient data movement plans among subtasks and platforms; and (iii) an efficient plan enumeration algorithm, based on a novel enumeration algebra. We extensively evaluate our optimizer under diverse real tasks. We show that our optimizer can perform tasks more than one order of magnitude faster when using multiple platforms than when using a single platform.},
  language  = {en}
}
% Journal article, JDIQ 12(1): clustering algorithms that turn pairwise duplicates into entity clusters.
@article{DraisbachChristenNaumann2019,
  author    = {Draisbach, Uwe and Christen, Peter and Naumann, Felix},
  title     = {Transforming pairwise duplicates to entity clusters for high-quality duplicate detection},
  journal   = {ACM Journal of Data and Information Quality},
  volume    = {12},
  number    = {1},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3352591},
  pages     = {1--30},
  year      = {2019},
  abstract  = {Duplicate detection algorithms produce clusters of database records, each cluster representing a single real-world entity. As most of these algorithms use pairwise comparisons, the resulting (transitive) clusters can be inconsistent: Not all records within a cluster are sufficiently similar to be classified as duplicate. Thus, one of many subsequent clustering algorithms can further improve the result. We explain in detail, compare, and evaluate many of these algorithms and introduce three new clustering algorithms in the specific context of duplicate detection. Two of our three new algorithms use the structure of the input graph to create consistent clusters. Our third algorithm, and many other clustering algorithms, focus on the edge weights, instead. For evaluation, in contrast to related work, we experiment on true real-world datasets, and in addition examine in great detail various pair-selection strategies used in practice. While no overall winner emerges, we are able to identify best approaches for different situations. In scenarios with larger clusters, our proposed algorithm, Extended Maximum Clique Clustering (EMCC), and Markov Clustering show the best results. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.},
  language  = {en}
}
% Conference paper, ICDE 2019: graph-based cross-platform data movement strategy of RHEEM.
% Typed as inproceedings (was misc) so that the proceedings title is rendered via booktitle.
@inproceedings{KruseKaoudiQuianeRuizetal.2019,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Quian{\'e}-Ruiz, Jorge-Arnulfo and Chawla, Sanjay and Naumann, Felix and Contreras-Rojas, Bertty},
  title     = {Optimizing Cross-Platform Data Movement},
  booktitle = {2019 {IEEE} 35th International Conference on Data Engineering ({ICDE})},
  publisher = {IEEE},
  address   = {New York},
  isbn      = {978-1-5386-7474-1},
  issn      = {1084-4627},
  doi       = {10.1109/ICDE.2019.00162},
  pages     = {1642--1645},
  year      = {2019},
  abstract  = {Data analytics are moving beyond the limits of a single data processing platform. A cross-platform query optimizer is necessary to enable applications to run their tasks over multiple platforms efficiently and in a platform-agnostic manner. For the optimizer to be effective, it must consider data movement costs across different data processing platforms. In this paper, we present the graph-based data movement strategy used by RHEEM, our open-source cross-platform system. In particular, we (i) model the data movement problem as a new graph problem, which we prove to be NP-hard, and (ii) propose a novel graph exploration algorithm, which allows RHEEM to discover multiple hidden opportunities for cross-platform data processing.},
  language  = {en}
}
% Journal article, PVLDB 12(2): vision paper on modelling and exploring change in datasets.
@article{BleifussBornemannJohnsonetal.2018,
  author    = {Bleifuss, Tobias and Bornemann, Leon and Johnson, Theodore and Kalashnikov, Dmitri and Naumann, Felix and Srivastava, Divesh},
  title     = {Exploring Change},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {12},
  number    = {2},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3282495.3282496},
  pages     = {85--98},
  year      = {2018},
  abstract  = {Data and metadata in datasets experience many different kinds of change. Values are inserted, deleted or updated; rows appear and disappear; columns are added or repurposed, etc. In such a dynamic situation, users might have many questions related to changes in the dataset, for instance which parts of the data are trustworthy and which are not? Users will wonder: How many changes have there been in the recent minutes, days or years? What kind of changes were made at which points of time? How dirty is the data? Is data cleansing required? The fact that data changed can hint at different hidden processes or agendas: a frequently crowd-updated city name may be controversial; a person whose name has been recently changed may be the target of vandalism; and so on. We show various use cases that benefit from recognizing and exploring such change. We envision a system and methods to interactively explore such change, addressing the variability dimension of big data challenges. To this end, we propose a model to capture change and the process of exploring dynamic data to identify salient changes. We provide exploration primitives along with motivational examples and measures for the volatility of data. We identify technical challenges that need to be addressed to make our vision a reality, and propose directions of future work for the data management community.},
  language  = {en}
}