@book{BauckmannLeserNaumann2010, author = {Bauckmann, Jana and Leser, Ulf and Naumann, Felix}, title = {Efficient and exact computation of inclusion dependencies for data integration}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-048-9}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-41396}, publisher = {Universit{\"a}t Potsdam}, pages = {36}, year = {2010}, abstract = {Data obtained from foreign data sources often come with only superficial structural information, such as relation names and attribute names. Other types of metadata that are important for effective integration and meaningful querying of such data sets are missing. In particular, relationships among attributes, such as foreign keys, are crucial metadata for understanding the structure of an unknown database. The discovery of such relationships is difficult, because in principle for each pair of attributes in the database each pair of data values must be compared. A precondition for a foreign key is an inclusion dependency (IND) between the key and the foreign key attributes. We present with Spider an algorithm that efficiently finds all INDs in a given relational database. It leverages the sorting facilities of DBMS but performs the actual comparisons outside of the database to save computation. Spider analyzes very large databases up to an order of magnitude faster than previous approaches. We also evaluate in detail the effectiveness of several heuristics to reduce the number of necessary comparisons. Furthermore, we generalize Spider to find composite INDs covering multiple attributes, and partial INDs, which are true INDs for all but a certain number of values. This last type is particularly relevant when integrating dirty data as is often the case in the life sciences domain - our driving motivation.}, language = {en} } @misc{Donges2009, type = {Master Thesis}, author = {Donges, Jonathan}, title = {Complex networks in the climate system}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-49775}, school = {Universit{\"a}t Potsdam}, year = {2009}, abstract = {Complex network theory provides an elegant and powerful framework to statistically investigate the topology of local and long range dynamical interrelationships, i.e., teleconnections, in the climate system. Employing a refined methodology relying on linear and nonlinear measures of time series analysis, the intricate correlation structure within a multivariate climatological data set is cast into network form. Within this graph theoretical framework, vertices are identified with grid points taken from the data set representing a region on the the Earth's surface, and edges correspond to strong statistical interrelationships between the dynamics on pairs of grid points. The resulting climate networks are neither perfectly regular nor completely random, but display the intriguing and nontrivial characteristics of complexity commonly found in real world networks such as the internet, citation and acquaintance networks, food webs and cortical networks in the mammalian brain. Among other interesting properties, climate networks exhibit the "small-world" effect and possess a broad degree distribution with dominating super-nodes as well as a pronounced community structure. We have performed an extensive and detailed graph theoretical analysis of climate networks on the global topological scale focussing on the flow and centrality measure betweenness which is locally defined at each vertex, but includes global topological information by relying on the distribution of shortest paths between all pairs of vertices in the network. The betweenness centrality field reveals a rich internal structure in complex climate networks constructed from reanalysis and atmosphere-ocean coupled general circulation model (AOGCM) surface air temperature data. Our novel approach uncovers an elaborately woven meta-network of highly localized channels of strong dynamical information flow, that we relate to global surface ocean currents and dub the backbone of the climate network in analogy to the homonymous data highways of the internet. This finding points to a major role of the oceanic surface circulation in coupling and stabilizing the global temperature field in the long term mean (140 years for the model run and 60 years for reanalysis data). Carefully comparing the backbone structures detected in climate networks constructed using linear Pearson correlation and nonlinear mutual information, we argue that the high sensitivity of betweenness with respect to small changes in network structure may allow to detect the footprints of strongly nonlinear physical interactions in the climate system. The results presented in this thesis are thoroughly founded and substantiated using a hierarchy of statistical significance tests on the level of time series and networks, i.e., by tests based on time series surrogates as well as network surrogates. This is particularly relevant when working with real world data. Specifically, we developed new types of network surrogates to include the additional constraints imposed by the spatial embedding of vertices in a climate network. Our methodology is of potential interest for a broad audience within the physics community and various applied fields, because it is universal in the sense of being valid for any spatially extended dynamical system. It can help to understand the localized flow of dynamical information in any such system by combining multivariate time series analysis, a complex network approach and the information flow measure betweenness centrality. Possible fields of application include fluid dynamics (turbulence), plasma physics and biological physics (population models, neural networks, cell models). Furthermore, the climate network approach is equally relevant for experimental data as well as model simulations and hence introduces a novel perspective on model evaluation and data driven model building. Our work is timely in the context of the current debate on climate change within the scientific community, since it allows to assess from a new perspective the regional vulnerability and stability of the climate system while relying on global and not only on regional knowledge. The methodology developed in this thesis hence has the potential to substantially contribute to the understanding of the local effect of extreme events and tipping points in the earth system within a holistic global framework.}, language = {en} }