@phdthesis{Bauckmann2013, author = {Bauckmann, Jana}, title = {Dependency discovery for data integration}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-66645}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Data integration aims to combine data of different sources and to provide users with a unified view on these data. This task is as challenging as valuable. In this thesis we propose algorithms for dependency discovery to provide necessary information for data integration. We focus on inclusion dependencies (INDs) in general and a special form named conditional inclusion dependencies (CINDs): (i) INDs enable the discovery of structure in a given schema. (ii) INDs and CINDs support the discovery of cross-references or links between schemas. An IND "A in B" simply states that all values of attribute A are included in the set of values of attribute B. We propose an algorithm that discovers all inclusion dependencies in a relational data source. The challenge of this task is the complexity of testing all attribute pairs and further of comparing all of each attribute pair's values. The complexity of existing approaches depends on the number of attribute pairs, while ours depends only on the number of attributes. Thus, our algorithm enables to profile entirely unknown data sources with large schemas by discovering all INDs. Further, we provide an approach to extract foreign keys from the identified INDs. We extend our IND discovery algorithm to also find three special types of INDs: (i) Composite INDs, such as "AB in CD", (ii) approximate INDs that allow a certain amount of values of A to be not included in B, and (iii) prefix and suffix INDs that represent special cross-references between schemas. Conditional inclusion dependencies are inclusion dependencies with a limited scope defined by conditions over several attributes. Only the matching part of the instance must adhere the dependency. We generalize the definition of CINDs distinguishing covering and completeness conditions and define quality measures for conditions. We propose efficient algorithms that identify covering and completeness conditions conforming to given quality thresholds. The challenge for this task is twofold: (i) Which (and how many) attributes should be used for the conditions? (ii) Which attribute values should be chosen for the conditions? Previous approaches rely on pre-selected condition attributes or can only discover conditions applying to quality thresholds of 100\%. Our approaches were motivated by two application domains: data integration in the life sciences and link discovery for linked open data. We show the efficiency and the benefits of our approaches for use cases in these domains.}, language = {en} }