@phdthesis{Zieger2017, author = {Zieger, Tobias}, title = {Self-adaptive data quality}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-410573}, school = {Universit{\"a}t Potsdam}, pages = {vii, 125}, year = {2017}, abstract = {Carrying out business processes successfully is closely linked to the quality of the data inventory in an organization. Lacks in data quality lead to problems: Incorrect address data prevents (timely) shipments to customers. Erroneous orders lead to returns and thus to unnecessary effort. Wrong pricing forces companies to miss out on revenues or to impair customer satisfaction. If orders or customer records cannot be retrieved, complaint management takes longer. Due to erroneous inventories, too few or too much supplies might be reordered. A special problem with data quality and the reason for many of the issues mentioned above are duplicates in databases. Duplicates are different representations of same real-world objects in a dataset. However, these representations differ from each other and are for that reason hard to match by a computer. Moreover, the number of required comparisons to find those duplicates grows with the square of the dataset size. To cleanse the data, these duplicates must be detected and removed. Duplicate detection is a very laborious process. To achieve satisfactory results, appropriate software must be created and configured (similarity measures, partitioning keys, thresholds, etc.). Both requires much manual effort and experience. This thesis addresses automation of parameter selection for duplicate detection and presents several novel approaches that eliminate the need for human experience in parts of the duplicate detection process. A pre-processing step is introduced that analyzes the datasets in question and classifies their attributes semantically. Not only do these annotations help understanding the respective datasets, but they also facilitate subsequent steps, for example, by selecting appropriate similarity measures or normalizing the data upfront. This approach works without schema information. Following that, we show a partitioning technique that strongly reduces the number of pair comparisons for the duplicate detection process. The approach automatically finds particularly suitable partitioning keys that simultaneously allow for effective and efficient duplicate retrieval. By means of a user study, we demonstrate that this technique finds partitioning keys that outperform expert suggestions and additionally does not need manual configuration. Furthermore, this approach can be applied independently of the attribute types. To measure the success of a duplicate detection process and to execute the described partitioning approach, a gold standard is required that provides information about the actual duplicates in a training dataset. This thesis presents a technique that uses existing duplicate detection results and crowdsourcing to create a near gold standard that can be used for the purposes above. Another part of the thesis describes and evaluates strategies how to reduce these crowdsourcing costs and to achieve a consensus with less effort.}, language = {en} } @phdthesis{Hameed2024, author = {Hameed, Mazhar}, title = {Structural preparation of raw data files}, doi = {10.25932/publishup-65567}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-655678}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 117}, year = {2024}, abstract = {Data preparation stands as a cornerstone in the landscape of data science workflows, commanding a significant portion—approximately 80\%—of a data scientist's time. The extensive time consumption in data preparation is primarily attributed to the intricate challenge faced by data scientists in devising tailored solutions for downstream tasks. This complexity is further magnified by the inadequate availability of metadata, the often ad-hoc nature of preparation tasks, and the necessity for data scientists to grapple with a diverse range of sophisticated tools, each presenting its unique intricacies and demands for proficiency. Previous research in data management has traditionally concentrated on preparing the content within columns and rows of a relational table, addressing tasks, such as string disambiguation, date standardization, or numeric value normalization, commonly referred to as data cleaning. This focus assumes a perfectly structured input table. Consequently, the mentioned data cleaning tasks can be effectively applied only after the table has been successfully loaded into the respective data cleaning environment, typically in the later stages of the data processing pipeline. While current data cleaning tools are well-suited for relational tables, extensive data repositories frequently contain data stored in plain text files, such as CSV files, due to their adaptable standard. Consequently, these files often exhibit tables with a flexible layout of rows and columns, lacking a relational structure. This flexibility often results in data being distributed across cells in arbitrary positions, typically guided by user-specified formatting guidelines. Effectively extracting and leveraging these tables in subsequent processing stages necessitates accurate parsing. This thesis emphasizes what we define as the "structure" of a data file—the fundamental characters within a file essential for parsing and comprehending its content. Concentrating on the initial stages of the data preprocessing pipeline, this thesis addresses two crucial aspects: comprehending the structural layout of a table within a raw data file and automatically identifying and rectifying any structural issues that might hinder its parsing. Although these issues may not directly impact the table's content, they pose significant challenges in parsing the table within the file. Our initial contribution comprises an extensive survey of commercially available data preparation tools. This survey thoroughly examines their distinct features, the lacking features, and the necessity for preliminary data processing despite these tools. The primary goal is to elucidate the current state-of-the-art in data preparation systems while identifying areas for enhancement. Furthermore, the survey explores the encountered challenges in data preprocessing, emphasizing opportunities for future research and improvement. Next, we propose a novel data preparation pipeline designed for detecting and correcting structural errors. The aim of this pipeline is to assist users at the initial preprocessing stage by ensuring the correct loading of their data into their preferred systems. Our approach begins by introducing SURAGH, an unsupervised system that utilizes a pattern-based method to identify dominant patterns within a file, independent of external information, such as data types, row structures, or schemata. By identifying deviations from the dominant pattern, it detects ill-formed rows. Subsequently, our structure correction system, TASHEEH, gathers the identified ill-formed rows along with dominant patterns and employs a novel pattern transformation algebra to automatically rectify errors. Our pipeline serves as an end-to-end solution, transforming a structurally broken CSV file into a well-formatted one, usually suitable for seamless loading. Finally, we introduce MORPHER, a user-friendly GUI integrating the functionalities of both SURAGH and TASHEEH. This interface empowers users to access the pipeline's features through visual elements. Our extensive experiments demonstrate the effectiveness of our data preparation systems, requiring no user involvement. Both SURAGH and TASHEEH outperform existing state-of-the-art methods significantly in both precision and recall.}, language = {en} } @phdthesis{Draisbach2022, author = {Draisbach, Uwe}, title = {Efficient duplicate detection and the impact of transitivity}, doi = {10.25932/publishup-57214}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140}, school = {Universit{\"a}t Potsdam}, pages = {x, 150}, year = {2022}, abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but has led to a renaissance in duplicate detection research on the other hand. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. Transitivity implies that if record pairs ⟨ri,rj⟩ and ⟨rj,rk⟩ are classified as duplicates, then also record pair ⟨ri,rk⟩ has to be a duplicate. However, this reasoning might contradict with the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the threshold selection to classify whether a record pair is a duplicate or non-duplicate. Our experiments show that independently of the pair selection algorithm and the used similarity measure, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, earlier selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, we present with the Duplicate Count Strategy (DCS) and its enhancement DCS++ two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNMs window size based on the number of detected duplicates and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure is used for pairwise classifications to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially for the precision of results. In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best performing clustering approach for duplicate detection, although its runtime is longer than Markov Clustering due to the subexponential time complexity. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} }