@article{GronauSchaefer2021,
  author    = {Gronau, Norbert and Schaefer, Martin},
  title     = {Why metadata matters for the future of copyright},
  series    = {European Intellectual Property Review},
  volume    = {43},
  journal   = {European Intellectual Property Review},
  number    = {8},
  publisher = {Sweet \& Maxwell},
  address   = {London},
  issn      = {0142-0461},
  pages     = {488--494},
  year      = {2021},
  abstract  = {In the copyright industries of the 21st century, metadata is the grease required to make the engine of copyright run smoothly and powerfully for the benefit of creators, copyright industries and users alike. However, metadata is difficult to acquire and even more difficult to keep up to date, as the rights in content are mostly multi-layered, fragmented, international and volatile. This article explores the idea of a neutral metadata search and enhancement tool that could constitute a buffer to safeguard the interests of the various proprietary database owners and avoid the shortcomings of centralised databases.},
  language  = {en}
}

@phdthesis{Kruse2018,
  author   = {Kruse, Sebastian},
  title    = {Scalable data profiling},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-412521},
  school   = {Universit{\"a}t Potsdam},
  pages    = {ii, 156},
  year     = {2018},
  abstract = {Data profiling is the act of extracting structural metadata from datasets. Structural metadata, such as data dependencies and statistics, can support data management operations, such as data integration and data cleaning. Data management is often the most time-consuming activity in any data-related project. Its support is extremely valuable in our data-driven world, so that more time can be spent on the actual utilization of the data, e.g., building analytical models. In most scenarios, however, structural metadata is not given and must be extracted first. Therefore, efficient data profiling methods are highly desirable. Data profiling is a computationally expensive problem; in fact, most dependency discovery problems entail search spaces that grow exponentially in the number of attributes. To this end, this thesis introduces novel discovery algorithms for various types of data dependencies - namely inclusion dependencies, conditional inclusion dependencies, partial functional dependencies, and partial unique column combinations - that considerably improve over state-of-the-art algorithms in terms of efficiency and that scale to datasets that cannot be processed by existing algorithms. The key to those improvements lies not only in algorithmic innovations, such as novel pruning rules or traversal strategies, but also in algorithm designs tailored for distributed execution. While distributed data profiling has been mostly neglected by previous works, it is a logical consequence in the face of recent hardware trends and the computational hardness of dependency discovery. To demonstrate the utility of data profiling for data management, this thesis furthermore presents Metacrate, a database for structural metadata. Its salient features are its flexible data model, the capability to integrate various kinds of structural metadata, and its rich metadata analytics library. We show how to perform a data anamnesis of unknown, complex datasets based on this technology. In particular, we describe in detail how to reconstruct the schemata and assess their quality as part of the data anamnesis. The data profiling algorithms and Metacrate have been carefully implemented, integrated with the Metanome data profiling tool, and are available as free software. In that way, we intend to allow for easy repeatability of our research results and also provide them for actual usage in real-world data-related projects.},
  language = {en}
}

@phdthesis{Papenbrock2017,
  author   = {Papenbrock, Thorsten},
  title    = {Data profiling - efficient discovery of dependencies},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-406705},
  school   = {Universit{\"a}t Potsdam},
  pages    = {viii, ii, 141},
  year     = {2017},
  abstract = {Data profiling is the computer science discipline of analyzing a given dataset for its metadata. The types of metadata range from basic statistics, such as tuple counts, column aggregations, and value distributions, to much more complex structures, in particular inclusion dependencies (INDs), unique column combinations (UCCs), and functional dependencies (FDs). If present, these statistics and structures serve to efficiently store, query, change, and understand the data. Most datasets, however, do not provide their metadata explicitly, so that data scientists need to profile them. While basic statistics are relatively easy to calculate, more complex structures present difficult, mostly NP-complete discovery tasks; even with good domain knowledge, it is hardly possible to detect them manually. Therefore, various profiling algorithms have been developed to automate the discovery. None of them, however, can process datasets of typical real-world size, because their resource consumption and/or execution time exceeds effective limits. In this thesis, we propose novel profiling algorithms that automatically discover the three most popular types of complex metadata, namely INDs, UCCs, and FDs, which all describe different kinds of key dependencies. The task is to extract all valid occurrences from a given relational instance. The three algorithms build upon known techniques from related work and complement them with algorithmic paradigms, such as divide \& conquer, hybrid search, progressivity, memory sensitivity, parallelization, and additional pruning, to greatly improve upon current limitations. Our experiments show that the proposed algorithms are orders of magnitude faster than related work. In particular, they are now able to process datasets of real-world size, i.e., multiple gigabytes, with reasonable memory and time consumption. Due to the importance of data profiling in practice, industry has built various profiling tools to support data scientists in their quest for metadata. These tools provide good support for basic statistics and they are also able to validate individual dependencies, but they lack real discovery features, even though some fundamental discovery techniques have been known for more than 15 years. To close this gap, we developed Metanome, an extensible profiling platform that incorporates not only our own algorithms but also many further algorithms from other researchers. With Metanome, we make our research accessible to all data scientists and IT professionals who are tasked with data profiling. Besides the actual metadata discovery, the platform also offers support for the ranking and visualization of metadata result sets. Being able to discover the entire set of syntactically valid metadata naturally introduces the subsequent task of extracting only the semantically meaningful parts. This is a challenge, because the complete metadata results are surprisingly large (sometimes larger than the dataset itself) and judging their use-case-dependent semantic relevance is difficult. To show that the completeness of these metadata sets is extremely valuable for their usage, we finally exemplify the efficient processing and effective assessment of functional dependencies for the use case of schema normalization.},
  language = {en}
}

@article{vanHoolandVerborghDeWildeetal.2013,
  author    = {van Hooland, Seth and Verborgh, Ruben and De Wilde, Max and Hercher, Johannes and Mannens, Erik and Van de Walle, Rik},
  title     = {Evaluating the success of vocabulary reconciliation for cultural heritage collections},
  series    = {Journal of the American Society for Information Science and Technology},
  volume    = {64},
  journal   = {Journal of the American Society for Information Science and Technology},
  number    = {3},
  publisher = {Wiley-Blackwell},
  address   = {Hoboken},
  issn      = {1532-2882},
  doi       = {10.1002/asi.22763},
  pages     = {464--479},
  year      = {2013},
  abstract  = {The concept of Linked Data has made its entrance in the cultural heritage sector due to its potential use for the integration of heterogeneous collections and deriving additional value out of existing metadata. However, practitioners and researchers alike need a better understanding of what outcome they can reasonably expect of the reconciliation process between their local metadata and established controlled vocabularies which are already a part of the Linked Data cloud. This paper offers an in-depth analysis of how a locally developed vocabulary can be successfully reconciled with the Library of Congress Subject Headings (LCSH) and the Arts and Architecture Thesaurus (AAT) with the help of a general-purpose tool for interactive data transformation (OpenRefine). Issues negatively affecting the reconciliation process are identified, and solutions are proposed in order to derive maximum value from existing metadata and controlled vocabularies in an automated manner.},
  language  = {en}
}