@phdthesis{Zurell2011, author = {Zurell, Damaris}, title = {Integrating dynamic and statistical modelling approaches in order to improve predictions for scenarios of environmental change}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-56845}, school = {Universit{\"a}t Potsdam}, year = {2011}, abstract = {Species respond to environmental change by dynamically adjusting their geographical ranges. Robust predictions of these changes are prerequisites to inform dynamic and sustainable conservation strategies. Correlative species distribution models (SDMs) relate species' occurrence records to prevailing environmental factors to describe the environmental niche. They have been widely applied in global change context as they have comparably low data requirements and allow for rapid assessments of potential future species' distributions. However, due to their static nature, transient responses to environmental change are essentially ignored in SDMs. Furthermore, neither dispersal nor demographic processes and biotic interactions are explicitly incorporated. Therefore, it has often been suggested to link statistical and mechanistic modelling approaches in order to make more realistic predictions of species' distributions for scenarios of environmental change. In this thesis, I present two different ways of such linkage. (i) Mechanistic modelling can act as virtual playground for testing statistical models and allows extensive exploration of specific questions. I promote this 'virtual ecologist' approach as a powerful evaluation framework for testing sampling protocols, analyses and modelling tools. Also, I employ such an approach to systematically assess the effects of transient dynamics and ecological properties and processes on the prediction accuracy of SDMs for climate change projections. That way, relevant mechanisms are identified that shape the species' response to altered environmental conditions and which should hence be considered when trying to project species' distribution through time. (ii) I supplement SDM projections of potential future habitat for black grouse in Switzerland with an individual-based population model. By explicitly considering complex interactions between habitat availability and demographic processes, this allows for a more direct assessment of expected population response to environmental change and associated extinction risks. However, predictions were highly variable across simulations emphasising the need for principal evaluation tools like sensitivity analysis to assess uncertainty and robustness in dynamic range predictions. Furthermore, I identify data coverage of the environmental niche as a likely cause for contrasted range predictions between SDM algorithms. SDMs may fail to make reliable predictions for truncated and edge niches, meaning that portions of the niche are not represented in the data or niche edges coincide with data limits. Overall, my thesis contributes to an improved understanding of uncertainty factors in predictions of range dynamics and presents ways how to deal with these. Finally I provide preliminary guidelines for predictive modelling of dynamic species' response to environmental change, identify key challenges for future research and discuss emerging developments.}, language = {en} } @phdthesis{Bach2013, author = {Bach, Christoph}, title = {Improving statistical seismicity models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-70591}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Several mechanisms are proposed to be part of the earthquake triggering process, including static stress interactions and dynamic stress transfer. Significant differences of these mechanisms are particularly expected in the spatial distribution of aftershocks. However, testing the different hypotheses is challenging because it requires the consideration of the large uncertainties involved in stress calculations as well as the appropriate consideration of secondary aftershock triggering which is related to stress changes induced by smaller pre- and aftershocks. In order to evaluate the forecast capability of different mechanisms, I take the effect of smaller--magnitude earthquakes into account by using the epidemic type aftershock sequence (ETAS) model where the spatial probability distribution of direct aftershocks, if available, is correlated to alternative source information and mechanisms. Surface shaking, rupture geometry, and slip distributions are tested. As an approximation of the shaking level, ShakeMaps are used which are available in near real-time after a mainshock and thus could be used for first-order forecasts of the spatial aftershock distribution. Alternatively, the use of empirical decay laws related to minimum fault distance is tested and Coulomb stress change calculations based on published and random slip models. For comparison, the likelihood values of the different model combinations are analyzed in the case of several well-known aftershock sequences (1992 Landers, 1999 Hector Mine, 2004 Parkfield). The tests show that the fault geometry is the most valuable information for improving aftershock forecasts. Furthermore, they reveal that static stress maps can additionally improve the forecasts of off--fault aftershock locations, while the integration of ground shaking data could not upgrade the results significantly. In the second part of this work, I focused on a procedure to test the information content of inverted slip models. This allows to quantify the information gain if this kind of data is included in aftershock forecasts. For this purpose, the ETAS model based on static stress changes, which is introduced in part one, is applied. The forecast ability of the models is systematically tested for several earthquake sequences and compared to models using random slip distributions. The influence of subfault resolution and segment strike and dip is tested. Some of the tested slip models perform very good, in that cases almost no random slip models are found to perform better. Contrastingly, for some of the published slip models, almost all random slip models perform better than the published slip model. Choosing a different subfault resolution hardly influences the result, as long the general slip pattern is still reproducible. Whereas different strike and dip values strongly influence the results depending on the standard deviation chosen, which is applied in the process of randomly selecting the strike and dip values.}, language = {en} } @phdthesis{Haider2013, author = {Haider, Peter}, title = {Prediction with Mixture Models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-69617}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Learning a model for the relationship between the attributes and the annotated labels of data examples serves two purposes. Firstly, it enables the prediction of the label for examples without annotation. Secondly, the parameters of the model can provide useful insights into the structure of the data. If the data has an inherent partitioned structure, it is natural to mirror this structure in the model. Such mixture models predict by combining the individual predictions generated by the mixture components which correspond to the partitions in the data. Often the partitioned structure is latent, and has to be inferred when learning the mixture model. Directly evaluating the accuracy of the inferred partition structure is, in many cases, impossible because the ground truth cannot be obtained for comparison. However it can be assessed indirectly by measuring the prediction accuracy of the mixture model that arises from it. This thesis addresses the interplay between the improvement of predictive accuracy by uncovering latent cluster structure in data, and further addresses the validation of the estimated structure by measuring the accuracy of the resulting predictive model. In the application of filtering unsolicited emails, the emails in the training set are latently clustered into advertisement campaigns. Uncovering this latent structure allows filtering of future emails with very low false positive rates. In order to model the cluster structure, a Bayesian clustering model for dependent binary features is developed in this thesis. Knowing the clustering of emails into campaigns can also aid in uncovering which emails have been sent on behalf of the same network of captured hosts, so-called botnets. This association of emails to networks is another layer of latent clustering. Uncovering this latent structure allows service providers to further increase the accuracy of email filtering and to effectively defend against distributed denial-of-service attacks. To this end, a discriminative clustering model is derived in this thesis that is based on the graph of observed emails. The partitionings inferred using this model are evaluated through their capacity to predict the campaigns of new emails. Furthermore, when classifying the content of emails, statistical information about the sending server can be valuable. Learning a model that is able to make use of it requires training data that includes server statistics. In order to also use training data where the server statistics are missing, a model that is a mixture over potentially all substitutions thereof is developed. Another application is to predict the navigation behavior of the users of a website. Here, there is no a priori partitioning of the users into clusters, but to understand different usage scenarios and design different layouts for them, imposing a partitioning is necessary. The presented approach simultaneously optimizes the discriminative as well as the predictive power of the clusters. Each model is evaluated on real-world data and compared to baseline methods. The results show that explicitly modeling the assumptions about the latent cluster structure leads to improved predictions compared to the baselines. It is beneficial to incorporate a small number of hyperparameters that can be tuned to yield the best predictions in cases where the prediction accuracy can not be optimized directly.}, language = {en} } @phdthesis{Prasse2016, author = {Prasse, Paul}, title = {Pattern recognition for computer security}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-100251}, school = {Universit{\"a}t Potsdam}, pages = {VI, 75}, year = {2016}, abstract = {Computer Security deals with the detection and mitigation of threats to computer networks, data, and computing hardware. This thesis addresses the following two computer security problems: email spam campaign and malware detection. Email spam campaigns can easily be generated using popular dissemination tools by specifying simple grammars that serve as message templates. A grammar is disseminated to nodes of a bot net, the nodes create messages by instantiating the grammar at random. Email spam campaigns can encompass huge data volumes and therefore pose a threat to the stability of the infrastructure of email service providers that have to store them. Malware -software that serves a malicious purpose- is affecting web servers, client computers via active content, and client computers through executable files. Without the help of malware detection systems it would be easy for malware creators to collect sensitive information or to infiltrate computers. The detection of threats -such as email-spam messages, phishing messages, or malware- is an adversarial and therefore intrinsically difficult problem. Threats vary greatly and evolve over time. The detection of threats based on manually-designed rules is therefore difficult and requires a constant engineering effort. Machine-learning is a research area that revolves around the analysis of data and the discovery of patterns that describe aspects of the data. Discriminative learning methods extract prediction models from data that are optimized to predict a target attribute as accurately as possible. Machine-learning methods hold the promise of automatically identifying patterns that robustly and accurately detect threats. This thesis focuses on the design and analysis of discriminative learning methods for the two computer-security problems under investigation: email-campaign and malware detection. The first part of this thesis addresses email-campaign detection. We focus on regular expressions as a syntactic framework, because regular expressions are intuitively comprehensible by security engineers and administrators, and they can be applied as a detection mechanism in an extremely efficient manner. In this setting, a prediction model is provided with exemplary messages from an email-spam campaign. The prediction model has to generate a regular expression that reveals the syntactic pattern that underlies the entire campaign, and that a security engineers finds comprehensible and feels confident enough to use the expression to blacklist further messages at the email server. We model this problem as two-stage learning problem with structured input and output spaces which can be solved using standard cutting plane methods. Therefore we develop an appropriate loss function, and derive a decoder for the resulting optimization problem. The second part of this thesis deals with the problem of predicting whether a given JavaScript or PHP file is malicious or benign. Recent malware analysis techniques use static or dynamic features, or both. In fully dynamic analysis, the software or script is executed and observed for malicious behavior in a sandbox environment. By contrast, static analysis is based on features that can be extracted directly from the program file. In order to bypass static detection mechanisms, code obfuscation techniques are used to spread a malicious program file in many different syntactic variants. Deobfuscating the code before applying a static classifier can be subjected to mostly static code analysis and can overcome the problem of obfuscated malicious code, but on the other hand increases the computational costs of malware detection by an order of magnitude. In this thesis we present a cascaded architecture in which a classifier first performs a static analysis of the original code and -based on the outcome of this first classification step- the code may be deobfuscated and classified again. We explore several types of features including token \$n\$-grams, orthogonal sparse bigrams, subroutine-hashings, and syntax-tree features and study the robustness of detection methods and feature types against the evolution of malware over time. The developed tool scans very large file collections quickly and accurately. Each model is evaluated on real-world data and compared to reference methods. Our approach of inferring regular expressions to filter emails belonging to an email spam campaigns leads to models with a high true-positive rate at a very low false-positive rate that is an order of magnitude lower than that of a commercial content-based filter. Our presented system -REx-SVMshort- is being used by a commercial email service provider and complements content-based and IP-address based filtering. Our cascaded malware detection system is evaluated on a high-quality data set of almost 400,000 conspicuous PHP files and a collection of more than 1,00,000 JavaScript files. From our case study we can conclude that our system can quickly and accurately process large data collections at a low false-positive rate.}, language = {en} } @phdthesis{Quade2018, author = {Quade, Markus}, title = {Symbolic regression for identification, prediction, and control of dynamical systems}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-419790}, school = {Universit{\"a}t Potsdam}, pages = {xiii, 134}, year = {2018}, abstract = {In the present work, we use symbolic regression for automated modeling of dynamical systems. Symbolic regression is a powerful and general method suitable for data-driven identification of mathematical expressions. In particular, the structure and parameters of those expressions are identified simultaneously. We consider two main variants of symbolic regression: sparse regression-based and genetic programming-based symbolic regression. Both are applied to identification, prediction and control of dynamical systems. We introduce a new methodology for the data-driven identification of nonlinear dynamics for systems undergoing abrupt changes. Building on a sparse regression algorithm derived earlier, the model after the change is defined as a minimum update with respect to a reference model of the system identified prior to the change. The technique is successfully exemplified on the chaotic Lorenz system and the van der Pol oscillator. Issues such as computational complexity, robustness against noise and requirements with respect to data volume are investigated. We show how symbolic regression can be used for time series prediction. Again, issues such as robustness against noise and convergence rate are investigated us- ing the harmonic oscillator as a toy problem. In combination with embedding, we demonstrate the prediction of a propagating front in coupled FitzHugh-Nagumo oscillators. Additionally, we show how we can enhance numerical weather predictions to commercially forecast power production of green energy power plants. We employ symbolic regression for synchronization control in coupled van der Pol oscillators. Different coupling topologies are investigated. We address issues such as plausibility and stability of the control laws found. The toolkit has been made open source and is used in turbulence control applications. Genetic programming based symbolic regression is very versatile and can be adapted to many optimization problems. The heuristic-based algorithm allows for cost efficient optimization of complex tasks. We emphasize the ability of symbolic regression to yield white-box models. In contrast to black-box models, such models are accessible and interpretable which allows the usage of established tool chains.}, language = {en} } @article{HartmannEhlertFritz2019, author = {Hartmann, Julia and Ehlert, Antje and Fritz, Annemarie}, title = {Welche Rolle spielen sprachliche Parameter f{\"u}r die Entwicklung integrierter verbal-numerischer Konzepte im vierten Lebensjahr?}, series = {Fr{\"u}he Bildung : interdisziplin{\"a}re Zeitschrift f{\"u}r Forschung, Ausbildung und Praxis}, volume = {8}, journal = {Fr{\"u}he Bildung : interdisziplin{\"a}re Zeitschrift f{\"u}r Forschung, Ausbildung und Praxis}, number = {1}, publisher = {Hogrefe}, address = {G{\"o}ttingen}, issn = {2191-9186}, doi = {10.1026/2191-9186/a000410}, pages = {44 -- 52}, year = {2019}, abstract = {Der Beitrag untersucht, ob und zu welchen Anteilen fr{\"u}he sprachliche Kompetenzen numerische Kompetenzen vorhersagen. An 72 dreij{\"a}hrigen Kindern wurden numerische, verbal produktive und rezeptive sowie grammatische Leistungen zwei Mal im Abstand von drei Monaten erhoben. Mithilfe von Strukturgleichungsmodellen kann gezeigt werden, dass sprachliche und numerische Leistungen in diesem Alter noch wenig distinkt sind. F{\"u}r die numerischen Kompetenzen findet sich bereits in diesem Alter eine hohe interindividuelle Entwicklungsstabilit{\"a}t. Ein bedeutsamer Einfluss sprachlicher Kompetenz auf den Zuwachs mathematischer Kompetenz im vierten Lebensjahr konnte nicht nachgewiesen werden. Wir diskutieren die Ergebnisse vor dem Hintergrund der aktuellen Thesen zum Zusammenhang von Sprache und Numerik in der Entwicklung.}, language = {de} } @phdthesis{Elsaid2022, author = {Elsaid, Mohamed Esameldin Mohamed}, title = {Virtual machines live migration cost modeling and prediction}, doi = {10.25932/publishup-54001}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-540013}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 107}, year = {2022}, abstract = {Dynamic resource management is an essential requirement for private and public cloud computing environments. With dynamic resource management, the physical resources assignment to the cloud virtual resources depends on the actual need of the applications or the running services, which enhances the cloud physical resources utilization and reduces the offered services cost. In addition, the virtual resources can be moved across different physical resources in the cloud environment without an obvious impact on the running applications or services production. This means that the availability of the running services and applications in the cloud is independent on the hardware resources including the servers, switches and storage failures. This increases the reliability of using cloud services compared to the classical data-centers environments. In this thesis we briefly discuss the dynamic resource management topic and then deeply focus on live migration as the definition of the compute resource dynamic management. Live migration is a commonly used and an essential feature in cloud and virtual data-centers environments. Cloud computing load balance, power saving and fault tolerance features are all dependent on live migration to optimize the virtual and physical resources usage. As we will discuss in this thesis, live migration shows many benefits to cloud and virtual data-centers environments, however the cost of live migration can not be ignored. Live migration cost includes the migration time, downtime, network overhead, power consumption increases and CPU overhead. IT admins run virtual machines live migrations without an idea about the migration cost. So, resources bottlenecks, higher migration cost and migration failures might happen. The first problem that we discuss in this thesis is how to model the cost of the virtual machines live migration. Secondly, we investigate how to make use of machine learning techniques to help the cloud admins getting an estimation of this cost before initiating the migration for one of multiple virtual machines. Also, we discuss the optimal timing for a specific virtual machine before live migration to another server. Finally, we propose practical solutions that can be used by the cloud admins to be integrated with the cloud administration portals to answer the raised research questions above. Our research methodology to achieve the project objectives is to propose empirical models based on using VMware test-beds with different benchmarks tools. Then we make use of the machine learning techniques to propose a prediction approach for virtual machines live migration cost. Timing optimization for live migration is also proposed in this thesis based on using the cost prediction and data-centers network utilization prediction. Live migration with persistent memory clusters is also discussed at the end of the thesis. The cost prediction and timing optimization techniques proposed in this thesis could be practically integrated with VMware vSphere cluster portal such that the IT admins can now use the cost prediction feature and timing optimization option before proceeding with a virtual machine live migration. Testing results show that our proposed approach for VMs live migration cost prediction shows acceptable results with less than 20\% prediction error and can be easily implemented and integrated with VMware vSphere as an example of a commonly used resource management portal for virtual data-centers and private cloud environments. The results show that using our proposed VMs migration timing optimization technique also could save up to 51\% of migration time of the VMs migration time for memory intensive workloads and up to 27\% of the migration time for network intensive workloads. This timing optimization technique can be useful for network admins to save migration time with utilizing higher network rate and higher probability of success. At the end of this thesis, we discuss the persistent memory technology as a new trend in servers memory technology. Persistent memory modes of operation and configurations are discussed in detail to explain how live migration works between servers with different memory configuration set up. Then, we build a VMware cluster with persistent memory inside server and also with DRAM only servers to show the live migration cost difference between the VMs with DRAM only versus the VMs with persistent memory inside.}, language = {en} } @phdthesis{Sharma2024, author = {Sharma, Shubham}, title = {Integrated approaches to earthquake forecasting}, doi = {10.25932/publishup-63612}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-636125}, school = {Universit{\"a}t Potsdam}, pages = {xvi, 76}, year = {2024}, abstract = {A comprehensive study on seismic hazard and earthquake triggering is crucial for effective mitigation of earthquake risks. The destructive nature of earthquakes motivates researchers to work on forecasting despite the apparent randomness of the earthquake occurrences. Understanding their underlying mechanisms and patterns is vital, given their potential for widespread devastation and loss of life. This thesis combines methodologies, including Coulomb stress calculations and aftershock analysis, to shed light on earthquake complexities, ultimately enhancing seismic hazard assessment. The Coulomb failure stress (CFS) criterion is widely used to predict the spatial distributions of aftershocks following large earthquakes. However, uncertainties associated with CFS calculations arise from non-unique slip inversions and unknown fault networks, particularly due to the choice of the assumed aftershocks (receiver) mechanisms. Recent studies have proposed alternative stress quantities and deep neural network approaches as superior to CFS with predefined receiver mechanisms. To challenge these propositions, I utilized 289 slip inversions from the SRCMOD database to calculate more realistic CFS values for a layered-half space and variable receiver mechanisms. The analysis also investigates the impact of magnitude cutoff, grid size variation, and aftershock duration on the ranking of stress metrics using receiver operating characteristic (ROC) analysis. Results reveal the performance of stress metrics significantly improves after accounting for receiver variability and for larger aftershocks and shorter time periods, without altering the relative ranking of the different stress metrics. To corroborate Coulomb stress calculations with the findings of earthquake source studies in more detail, I studied the source properties of the 2005 Kashmir earthquake and its aftershocks, aiming to unravel the seismotectonics of the NW Himalayan syntaxis. I simultaneously relocated the mainshock and its largest aftershocks using phase data, followed by a comprehensive analysis of Coulomb stress changes on the aftershock planes. By computing the Coulomb failure stress changes on the aftershock faults, I found that all large aftershocks lie in regions of positive stress change, indicating triggering by either co-seismic or post-seismic slip on the mainshock fault. Finally, I investigated the relationship between mainshock-induced stress changes and associated seismicity parameters, in particular those of the frequency-magnitude (Gutenberg-Richter) distribution and the temporal aftershock decay (Omori-Utsu law). For that purpose, I used my global data set of 127 mainshock-aftershock sequences with the calculated Coulomb Stress (ΔCFS) and the alternative receiver-independent stress metrics in the vicinity of the mainshocks and analyzed the aftershocks properties depend on the stress values. Surprisingly, the results show a clear positive correlation between the Gutenberg-Richter b-value and induced stress, contrary to expectations from laboratory experiments. This observation highlights the significance of structural heterogeneity and strength variations in seismicity patterns. Furthermore, the study demonstrates that aftershock productivity increases nonlinearly with stress, while the Omori-Utsu parameters c and p systematically decrease with increasing stress changes. These partly unexpected findings have significant implications for future estimations of aftershock hazard. The findings in this thesis provides valuable insights into earthquake triggering mechanisms by examining the relationship between stress changes and aftershock occurrence. The results contribute to improved understanding of earthquake behavior and can aid in the development of more accurate probabilistic-seismic hazard forecasts and risk reduction strategies.}, language = {en} }