@phdthesis{Draisbach2022, author = {Draisbach, Uwe}, title = {Efficient duplicate detection and the impact of transitivity}, doi = {10.25932/publishup-57214}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140}, school = {Universit{\"a}t Potsdam}, pages = {x, 150}, year = {2022}, abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but on the other hand has led to a renaissance in duplicate detection research. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. Transitivity implies that if record pairs $\langle r_i, r_j \rangle$ and $\langle r_j, r_k \rangle$ are classified as duplicates, then record pair $\langle r_i, r_k \rangle$ also has to be a duplicate. However, this reasoning might contradict the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the selection of the threshold used to classify whether a record pair is a duplicate or a non-duplicate. Our experiments show that, independently of the pair selection algorithm and the similarity measure used, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, previously selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, we present the Duplicate Count Strategy (DCS) and its enhancement DCS++, two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNM's window size based on the number of detected duplicates, and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure of the pairwise classifications is used to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially regarding the precision of the results. In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best-performing clustering approach for duplicate detection, although its runtime is longer than that of Markov Clustering due to its subexponential time complexity.
EMCC outperforms Markov Clustering especially regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} } @phdthesis{Harmouch2020, author = {Harmouch, Hazar}, title = {Single-column data profiling}, doi = {10.25932/publishup-47455}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-474554}, school = {Universit{\"a}t Potsdam}, pages = {x, 115}, year = {2020}, abstract = {The research area of data profiling consists of a large set of methods and processes to examine a given dataset and determine metadata about it. Typically, different data profiling tasks address different kinds of metadata, comprising either various statistics about individual columns (Single-column Analysis) or relationships among them (Dependency Discovery). Among the basic statistics about a column are its data type, header, number of unique values (the column's cardinality), maximum and minimum values, number of null values, and value distribution. Dependencies involve, for instance, functional dependencies (FDs), inclusion dependencies (INDs), and their approximate versions. Data profiling has a wide range of conventional use cases, namely data exploration, cleansing, and integration. The produced metadata is also useful for database management and schema reverse engineering. Data profiling also has more novel use cases, such as big data analytics. The generated metadata describes the structure of the data at hand, how to import it, what it is about, and how much of it there is. Thus, data profiling can be considered an important preparatory task for many data analysis and mining scenarios, helping to assess which data might be useful and to reveal and understand a new dataset's characteristics. In this thesis, the main focus is on the single-column analysis class of data profiling tasks. We study the impact and the extraction of three of the most important kinds of metadata about a column, namely its cardinality, its header, and its number of null values. First, we present a detailed experimental study of twelve cardinality estimation algorithms. We classify the algorithms and analyze their efficiency, scaling far beyond the original experiments and testing theoretical guarantees. Our results highlight their trade-offs and point out the possibility of creating parallel or distributed versions of these algorithms to cope with the growing size of modern datasets. Then, we present a fully automated, multi-phase system to discover human-understandable, representative, and consistent headers for a target table in cases where headers are missing, meaningless, or unrepresentative of the column values. Our evaluation on Wikipedia tables shows that 60\% of the automatically discovered schemata are exact and complete. Considering more schema candidates, for example the top five, increases this percentage to 72\%. Finally, we formally and experimentally demonstrate the phenomenon of ghost and fake FDs caused by FD discovery over datasets with missing values. We propose two efficient scores, one probabilistic and one likelihood-based, for estimating the genuineness of a discovered FD.
Our extensive set of experiments on real-world and semi-synthetic datasets shows the effectiveness and efficiency of these scores.}, language = {en} } @phdthesis{Zieger2017, author = {Zieger, Tobias}, title = {Self-adaptive data quality}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-410573}, school = {Universit{\"a}t Potsdam}, pages = {vii, 125}, year = {2017}, abstract = {Carrying out business processes successfully is closely linked to the quality of the data inventory in an organization. Deficiencies in data quality lead to problems: Incorrect address data prevents (timely) shipments to customers. Erroneous orders lead to returns and thus to unnecessary effort. Incorrect pricing causes companies to miss out on revenue or impairs customer satisfaction. If orders or customer records cannot be retrieved, complaint management takes longer. Due to erroneous inventories, too few or too many supplies might be reordered. A particular data quality problem, and the reason for many of the issues mentioned above, is the presence of duplicates in databases. Duplicates are different representations of the same real-world objects in a dataset. However, these representations differ from each other and are therefore hard for a computer to match. Moreover, the number of comparisons required to find those duplicates grows with the square of the dataset size. To cleanse the data, these duplicates must be detected and removed. Duplicate detection is a very laborious process: to achieve satisfactory results, appropriate software must be created and configured (similarity measures, partitioning keys, thresholds, etc.), and both require much manual effort and experience. This thesis addresses the automation of parameter selection for duplicate detection and presents several novel approaches that eliminate the need for human experience in parts of the duplicate detection process. A pre-processing step is introduced that analyzes the datasets in question and classifies their attributes semantically. Not only do these annotations help in understanding the respective datasets, but they also facilitate subsequent steps, for example by selecting appropriate similarity measures or normalizing the data upfront. This approach works without schema information. Following that, we present a partitioning technique that strongly reduces the number of pair comparisons for the duplicate detection process. The approach automatically finds particularly suitable partitioning keys that simultaneously allow for effective and efficient duplicate retrieval. By means of a user study, we demonstrate that this technique finds partitioning keys that outperform expert suggestions and, additionally, does not need manual configuration. Furthermore, this approach can be applied independently of the attribute types. To measure the success of a duplicate detection process and to execute the described partitioning approach, a gold standard is required that provides information about the actual duplicates in a training dataset. This thesis presents a technique that uses existing duplicate detection results and crowdsourcing to create a near gold standard that can be used for the purposes above.
Another part of the thesis describes and evaluates strategies for reducing these crowdsourcing costs and for reaching a consensus with less effort.}, language = {en} } @phdthesis{Pohlenz2008, author = {Pohlenz, Philipp}, title = {Datenqualit{\"a}t als Schl{\"u}sselfrage der Qualit{\"a}tssicherung von Lehre und Studium an Hochschulen}, series = {Potsdamer Beitr{\"a}ge zur Lehrevaluation}, journal = {Potsdamer Beitr{\"a}ge zur Lehrevaluation}, number = {3}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-940793-48-5}, issn = {1862-8664}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-19755}, school = {Universit{\"a}t Potsdam}, pages = {244}, year = {2008}, abstract = {Universities increasingly face a legitimation problem concerning their use of (publicly provided) resources. The criticism mainly targets teaching, which is said to be organized ineffectively and, through poor study conditions for which the universities themselves are held responsible, to contribute to long study durations and high dropout rates. It is asserted that students' lifetime is handled irresponsibly and that the societal educational mandate is not adequately fulfilled, neither by the university as a whole nor by individual teachers. To satisfy the simultaneously growing demand for academic education, universities are transforming themselves into service enterprises whose performance is measured by the efficiency of their offerings. Such a model is inspired by the governance principles of New Public Management, in which the state withdraws from its traditionally close relationship with the universities and grants them local autonomy, for instance by introducing global budgets for their financial self-management. The universities become market actors that prevail over their competitors in the contest for customers by demonstrating quality and excellence. Different evaluation procedures are employed to carry out such performance comparisons. These commonly involve both data from official higher education statistics, for instance in the form of graduation rates, and, increasingly, survey data, mostly from students, collected to capture their assessments of the quality of teaching and studying. Especially the latter are often met with the objection that they are not suited to adequately reflect the quality of teaching; rather, their informative value is limited by subjective biases. An assessment based on student survey data would accordingly lead to misjudgments and, consequently, to unjust performance sanctions. For evaluation procedures to be accepted as an instrument of university-internal quality assurance and development processes, it must therefore be examined to what extent impairments of the validity of the data bases used for university governance reduce their informative value. Based on the corresponding results, the procedures can be further developed. This question is at the center of the present work.}, language = {de} }