@phdthesis{Reinhardt2020, author = {Reinhardt, Maria}, title = {Hybrid filters and multi-scale models}, doi = {10.25932/publishup-47435}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-474356}, school = {Universit{\"a}t Potsdam}, pages = {xiii, 102}, year = {2020}, abstract = {This thesis is concerned with Data Assimilation, the process of combining model predictions with observations. So-called filters are of special interest. One is interested in computing the probability distribution of the state of a physical process in the future, given (possibly) imperfect measurements. This is done using Bayes' rule. The first part focuses on hybrid filters, which bridge between the two main groups of filters: ensemble Kalman filters (EnKF) and particle filters. The former are a group of very stable and computationally cheap algorithms, but they require certain strong assumptions. Particle filters, on the other hand, are more generally applicable, but computationally expensive and as such not always suitable for high-dimensional systems. Therefore, there is a need to combine both groups to benefit from the advantages of each. This can be achieved by splitting the likelihood function when assimilating a new observation, treating one part of it with an EnKF and the other part with a particle filter. The second part of this thesis deals with the application of Data Assimilation to multi-scale models and the problems that arise from that. One of the main areas of application for Data Assimilation techniques is predicting the development of oceans and the atmosphere. These processes involve several scales and often balance relations between the state variables. The use of Data Assimilation procedures most often violates relations of that kind, which eventually leads to unrealistic and non-physical predictions of the future development of the process. This work discusses the inclusion of a post-processing step after each assimilation step, in which a minimisation problem that penalises the imbalance is solved. This method is tested on four different models: two Hamiltonian systems and two spatially extended models, which add even more difficulties.}, language = {en} } @phdthesis{Perscheid2023, author = {Perscheid, Cindy}, title = {Integrative biomarker detection using prior knowledge on gene expression data sets}, doi = {10.25932/publishup-58241}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-582418}, school = {Universit{\"a}t Potsdam}, pages = {ix, 197}, year = {2023}, abstract = {Gene expression data is analyzed to identify biomarkers, e.g. relevant genes, which serve for diagnostic, predictive, or prognostic use. Traditional approaches for biomarker detection select distinctive features from the data based exclusively on the signals therein, facing multiple shortcomings with regard to overfitting, biomarker robustness, and actual biological relevance. Prior knowledge approaches are expected to address these issues by incorporating prior biological knowledge, e.g. on gene-disease associations, into the actual analysis. However, prior knowledge approaches are currently not widely applied in practice because they are often use-case specific and seldom applicable in a different scope. This leads to a lack of comparability of prior knowledge approaches, which in turn makes it currently impossible to assess their effectiveness in a broader context. Our work addresses the aforementioned issues with three contributions.
Our first contribution provides formal definitions for both prior knowledge and the flexible integration thereof into the feature selection process. Central to these concepts is the automatic retrieval of prior knowledge from online knowledge bases, which allows for streamlining the retrieval process and agreeing on a uniform definition for prior knowledge. We subsequently describe novel and generalized prior knowledge approaches that are flexible regarding the used prior knowledge and applicable to varying use case domains. Our second contribution is the benchmarking platform Comprior. Comprior applies the aforementioned concepts in practice and allows for flexibly setting up comprehensive benchmarking studies for examining the performance of existing and novel prior knowledge approaches. It streamlines the retrieval of prior knowledge and allows for combining it with prior knowledge approaches. Comprior demonstrates the practical applicability of our concepts and further fosters the overall development and comparability of prior knowledge approaches. Our third contribution is a comprehensive case study on the effectiveness of prior knowledge approaches. For that, we used Comprior and tested a broad range of both traditional and prior knowledge approaches in combination with multiple knowledge bases on data sets from multiple disease domains. Ultimately, our case study constitutes a thorough assessment of a) the suitability of selected knowledge bases for integration, b) the impact of prior knowledge being applied at different integration levels, and c) the improvements in terms of classification performance, biological relevance, and overall robustness. In summary, our contributions demonstrate that generalized concepts for prior knowledge and a streamlined retrieval process improve the applicability of prior knowledge approaches. Results from our case study show that the integration of prior knowledge positively affects biomarker results, particularly regarding their robustness. Our findings provide the first in-depth insights into the effectiveness of prior knowledge approaches and build a valuable foundation for future research.}, language = {en} } @phdthesis{Wichitsanguan2016, author = {Wichitsa-nguan, Korakot}, title = {Modifications and extensions of the logistic regression and Cox model}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-90033}, school = {Universit{\"a}t Potsdam}, pages = {x, 131}, year = {2016}, abstract = {In many statistical applications, the aim is to model the relationship between covariates and some outcomes. The choice of an appropriate model depends on the outcome and the research objectives, such as linear models for continuous outcomes, logistic models for binary outcomes and the Cox model for time-to-event data. In epidemiological, medical, biological, societal and economic studies, logistic regression is widely used to describe the relationship between a response variable as binary outcome and explanatory variables as a set of covariates. However, epidemiologic cohort studies are quite expensive regarding data management, since following up a large number of individuals takes a long time. Therefore, the case-cohort design is applied to reduce the cost and time of data collection. Case-cohort sampling collects a small random sample from the entire cohort, which is called the subcohort.
The advantage of this design is that the covariate and follow-up data are recorded only on the subcohort and all cases (all members of the cohort who develop the event of interest during the follow-up process). In this thesis, we investigate the estimation in the logistic model for the case-cohort design. First, a model with a binary response and a binary covariate is considered. The maximum likelihood estimator (MLE) is described and its asymptotic properties are established. An estimator for the asymptotic variance of the estimator based on the maximum likelihood approach is proposed; this estimator differs slightly from the estimator introduced by Prentice (1986). Simulation results for several proportions of the subcohort show that the proposed estimator gives lower empirical bias and empirical variance than Prentice's estimator. Then the MLE in the logistic regression with a discrete covariate under the case-cohort design is studied. Here the approach of the binary covariate model is extended. By proving asymptotic normality of the estimators, standard errors for the estimators can be derived. The simulation study demonstrates the estimation procedure of the logistic regression model with a one-dimensional discrete covariate. Simulation results for several proportions of the subcohort and different choices of the underlying parameters indicate that the estimator developed here performs reasonably well. Moreover, a comparison between theoretical values and simulation results for the asymptotic variance of the estimator is presented. Clearly, the logistic regression is sufficient when the binary outcome is available for all subjects and refers to a fixed time interval. Nevertheless, in practice, the observations in clinical trials are frequently collected over different time periods, and subjects may drop out or relapse from other causes during follow-up. Hence, the logistic regression is not appropriate for incomplete follow-up data; for example, when an individual drops out of the study before the end of data collection or when the event of interest has not occurred for an individual by the end of the study. These observations are called censored observations. Survival analysis is necessary to handle these problems; moreover, the time to the occurrence of the event of interest is taken into account. The Cox model, which can effectively handle censored data, has been widely used in survival analysis. Cox (1972) proposed the model, which focuses on the hazard function. The Cox model assumes the hazard λ(t|x) = λ0(t) exp(β^T x), where λ0(t) is an unspecified baseline hazard at time t, x is the vector of covariates, and β is a p-dimensional vector of coefficients. In this thesis, the Cox model is considered from the viewpoint of experimental design. The estimability of the parameter β0 in the Cox model, where β0 denotes the true value of β, and the choice of optimal covariates are investigated. We give new representations of the observed information matrix In(β) and extend results for the Cox model of Andersen and Gill (1982). In this way, conditions for the estimability of β0 are formulated. Under some regularity conditions, Σ is the inverse of the asymptotic variance matrix of the MPLE of β0 in the Cox model, and some properties of this asymptotic variance matrix are highlighted. Based on the results on asymptotic estimability, the calculation of locally optimal covariates is considered and shown in examples. In a sensitivity analysis, the efficiency of given covariates is calculated.
For neighborhoods of the exponential models, the efficiencies are then determined. It appears that for fixed parameters β0, the efficiencies do not change very much for different baseline hazard functions. Some proposals for applicable optimal covariates and a calculation procedure for finding optimal covariates are discussed. Furthermore, the extension of the Cox model in which time-dependent coefficients are allowed is investigated. In this situation, the maximum local partial likelihood estimator for estimating the coefficient function β(·) is described. Based on this estimator, we formulate a new test procedure for testing whether a one-dimensional coefficient function β(·) has a prespecified parametric form, say β(·; ϑ). The score function derived from the local constant partial likelihood function at d distinct grid points is considered. It is shown that the distribution of the properly standardized quadratic form of this d-dimensional vector under the null hypothesis tends to a Chi-squared distribution. Moreover, the limit statement remains true when replacing the unknown ϑ0 by the MPLE in the hypothetical model, and an asymptotic α-test is given by the quantiles or p-values of the limiting Chi-squared distribution. Finally, we propose a bootstrap version of this test. The bootstrap test is only defined for the special case of testing whether the coefficient function is constant. A simulation study illustrates the behavior of the bootstrap test under the null hypothesis and a special alternative. It gives quite good results for the chosen underlying model. References: P. K. Andersen and R. D. Gill. Cox's regression model for counting processes: a large sample study. Ann. Statist., 10(4):1100-1120, 1982. D. R. Cox. Regression models and life-tables. J. Roy. Statist. Soc. Ser. B, 34:187-220, 1972. R. L. Prentice. A case-cohort design for epidemiologic cohort studies and disease prevention trials. Biometrika, 73(1):1-11, 1986.}, language = {en} } @phdthesis{Solms2017, author = {Solms, Alexander Maximilian}, title = {Integrating nonlinear mixed effects and physiologically-based modeling approaches for the analysis of repeated measurement studies}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-397070}, school = {Universit{\"a}t Potsdam}, pages = {x, 141}, year = {2017}, abstract = {During the drug discovery \& development process, several phases encompassing a number of preclinical and clinical studies have to be successfully passed to demonstrate safety and efficacy of a new drug candidate. As part of these studies, the characterization of the drug's pharmacokinetics (PK) is an important aspect, since the PK is assumed to strongly impact safety and efficacy. To this end, drug concentrations are measured repeatedly over time in a study population. The objectives of such studies are to describe the typical PK time-course and the associated variability between subjects. Furthermore, underlying sources significantly contributing to this variability, e.g. the use of comedication, should be identified. The most commonly used statistical framework to analyse repeated measurement data is the nonlinear mixed effects (NLME) approach. At the same time, ample knowledge about the drug's properties already exists and has been accumulating during the discovery \& development process: Before any drug is tested in humans, detailed knowledge about the PK in different animal species has to be collected.
This drug-specific knowledge and general knowledge about the species' physiology is exploited in mechanistic physiologically-based PK (PBPK) modeling approaches; it is, however, ignored in the classical NLME modeling approach. Mechanistic physiologically-based models aim to incorporate relevant and known physiological processes which contribute to the overlying process of interest. In comparison to data-driven models, they are usually more complex from a mathematical perspective. For example, in many situations, the number of model parameters exceeds the number of measurements, and thus reliable parameter estimation becomes more complex and partly impossible. As a consequence, the integration of powerful mathematical estimation approaches like the NLME modeling approach, which is widely used in data-driven modeling, with the mechanistic modeling approach is not well established; the observed data is rather used as a confirming input instead of one that informs and builds the model. Another aggravating circumstance for an integrated approach is the inaccessibility of the details of the NLME methodology, which prevents adapting these approaches to the specifics and needs of mechanistic modeling. Despite the fact that the NLME modeling approach has existed for several decades, details of the mathematical methodology are scattered across a wide range of literature, and a comprehensive, rigorous derivation is lacking. Available literature usually only covers selected parts of the mathematical methodology. Sometimes, important steps are not described or are only heuristically motivated, e.g. the iterative algorithm to finally determine the parameter estimates. Thus, in the present thesis the mathematical methodology of NLME modeling is systematically described and complemented into a comprehensive description, comprising the common theme from ideas and motivation to the final parameter estimation. Therein, new insights for the interpretation of different approximation methods used in the context of the NLME modeling approach are given and illustrated; furthermore, similarities and differences between them are outlined. Based on these findings, an expectation-maximization (EM) algorithm to determine estimates of an NLME model is described. Using the EM algorithm and the lumping methodology of Pilari (2010), a new approach for combining PBPK and NLME modeling is presented and exemplified for the antibiotic levofloxacin. Therein, the lumping identifies which processes are informed by the available data, and the respective model reduction improves the robustness of parameter estimation. Furthermore, it is shown how a priori known factors influencing the variability and a priori known unexplained variability are incorporated to further mechanistically drive the model development. In conclusion, correlations between parameters and between covariates are automatically accounted for due to the mechanistic derivation of the lumping and the covariate relationships. A useful feature of PBPK models compared to classical data-driven PK models is the possibility to predict drug concentrations within all organs and tissues in the body. Thus, the resulting PBPK model for levofloxacin is used to predict drug concentrations and their variability within soft tissues, which are the site of action for levofloxacin.
These predictions are compared with data from muscle and adipose tissue obtained by microdialysis, an invasive technique for measuring a proportion of drug in the tissue that allows approximating the concentrations in the interstitial fluid of tissues. Because comparisons of human in vivo tissue PK with PBPK predictions have so far not been established, a new conceptual framework is derived. The comparison of PBPK model predictions and microdialysis measurements shows an adequate agreement and reveals further strengths of the presented new approach. We demonstrated how mechanistic PBPK models, which are usually developed in the early stages of drug development, can be used as a basis for model building in the analysis of later stages, i.e. in clinical studies. As a consequence, the extensively collected and accumulated knowledge about species and drug is utilized and updated with specific volunteer or patient data. The NLME approach combined with mechanistic modeling reveals new insights for the mechanistic model, for example the identification and quantification of variability in mechanistic processes. This represents a further contribution to the learn \& confirm paradigm across different stages of drug development. Finally, the applicability of mechanism-driven model development is demonstrated on an example from the field of Quantitative Psycholinguistics to analyse repeated eye movement data. Our approach gives new insight into the interpretation of these experiments and the processes behind them.}, language = {en} } @phdthesis{Braun2023, author = {Braun, Tobias}, title = {Recurrences in past climates}, doi = {10.25932/publishup-58690}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-586900}, school = {Universit{\"a}t Potsdam}, pages = {xxviii, 251}, year = {2023}, abstract = {Our ability to predict the state of a system relies on its tendency to recur to states it has visited before. Recurrence also pervades common intuitions about the systems we are most familiar with: daily routines, social rituals and the return of the seasons are just a few relatable examples. To this end, recurrence plots (RPs) provide a systematic framework to quantify the recurrence of states. Despite their conceptual simplicity, they are a versatile tool in the study of observational data. The global climate is a complex system for which an understanding based on observational data is not only of academic relevance, but vital for the perdurance of human societies within the planetary boundaries. Contextualizing current global climate change, however, requires observational data far beyond the instrumental period. The palaeoclimate record offers a valuable archive of proxy data but demands methodological approaches that adequately address its complexities. In this regard, the following dissertation aims at devising novel methods and further developing existing ones within the framework of recurrence analysis (RA). The proposed research questions focus on using RA to capture scale-dependent properties in nonlinear time series and on tailoring recurrence quantification analysis (RQA) to characterize seasonal variability in palaeoclimate records ('Palaeoseasonality'). In the first part of this thesis, we focus on the methodological development of novel approaches in RA. The predictability of nonlinear (palaeo)climate time series is limited by abrupt transitions between regimes that exhibit entirely different dynamical complexity (e.g. the crossing of 'tipping points'). These possibly depend on characteristic time scales.
RPs are well established for detecting transitions and for capturing scale-dependencies, yet few approaches have combined both aspects. We apply existing concepts from the study of self-similar textures to RPs to detect abrupt transitions, considering the most relevant time scales. This combination of methods further results in the definition of a novel recurrence-based nonlinear dependence measure. Quantifying lagged interactions between multiple variables is a common problem, especially in the characterization of high-dimensional complex systems. The proposed 'recurrence flow' measure of nonlinear dependence offers an elegant way to characterize such couplings. For spatially extended complex systems, the coupled dynamics of local variables result in the emergence of spatial patterns. These patterns tend to recur in time. Based on this observation, we propose a novel method that identifies dynamically distinct regimes of atmospheric circulation based on their recurrent spatial patterns. Bridging the two parts of this dissertation, we next turn to methodological advances of RA for the study of Palaeoseasonality. Observational series of palaeoclimate 'proxy' records involve inherent limitations, such as irregular temporal sampling. We reveal biases in the RQA of time series with a non-stationary sampling rate and propose a correction scheme. In the second part of this thesis, we proceed with applications in Palaeoseasonality. A review of common and promising time series analysis methods shows that numerous valuable tools exist, but their sound application requires adaptations to archive-specific limitations and the consolidation of transdisciplinary knowledge. Next, we study stalagmite proxy records from the Central Pacific as sensitive recorders of mid-Holocene El Ni{\~n}o-Southern Oscillation (ENSO) dynamics. The records' remarkably high temporal resolution allows links to be drawn between ENSO and seasonal dynamics, quantified by RA. The final study presented here examines the role seasonal predictability could play in the stability of agricultural societies. The Classic Maya underwent a period of sociopolitical disintegration that has been linked to drought events. Based on seasonally resolved stable isotope records from Yok Balum cave in Belize, we propose a measure of seasonal predictability. It unveils the potential role declining seasonal predictability could have played in destabilizing the agricultural and sociopolitical systems of Classic Maya populations. The methodological approaches and applications presented in this work reveal multiple exciting future research avenues, both for RA and for the study of Palaeoseasonality.}, language = {en} } @phdthesis{Wendi2018, author = {Wendi, Dadiyorto}, title = {Recurrence Plots and Quantification Analysis of Flood Runoff Dynamics}, doi = {10.25932/publishup-43191}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-431915}, school = {Universit{\"a}t Potsdam}, pages = {114}, year = {2018}, abstract = {This paper introduces a novel measure to assess similarity between event hydrographs. It is based on Cross Recurrence Plots and Recurrence Quantification Analysis, which have recently gained attention in a range of disciplines when dealing with complex systems. The method attempts to quantify the event runoff dynamics and is based on the time delay embedded phase space representation of discharge hydrographs.
A phase space trajectory is reconstructed from the event hydrograph, and pairs of hydrographs are compared to each other based on the distance between their phase space trajectories. Time delay embedding allows considering the multi-dimensional relationships between different points in time within the event. Hence, the temporal succession of discharge values is taken into account, such as the impact of the initial conditions on the runoff event. We provide an introduction to Cross Recurrence Plots and discuss their parameterization. An application example based on flood time series demonstrates how the method can be used to measure the similarity or dissimilarity of events, and how it can be used to detect events with rare runoff dynamics. It is argued that this method provides a more comprehensive approach to quantifying hydrograph similarity compared to conventional hydrological signatures.}, language = {en} }