@comment{Zhelavskaya 2020: doctoral thesis on plasmasphere density modeling with neural networks and data assimilation.}
@phdthesis{Zhelavskaya2020,
  author   = {Zhelavskaya, Irina},
  title    = {Modeling of the Plasmasphere Dynamics},
  school   = {Universit{\"a}t Potsdam},
  year     = {2020},
  pages    = {xlii, 256},
  doi      = {10.25932/publishup-48243},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-482433},
  language = {en},
  abstract = {The plasmasphere is a dynamic region of cold, dense plasma surrounding the Earth. Its shape and size are highly susceptible to variations in solar and geomagnetic conditions. Having an accurate model of plasma density in the plasmasphere is important for GNSS navigation and for predicting hazardous effects of radiation in space on spacecraft. The distribution of cold plasma and its dynamic dependence on solar wind and geomagnetic conditions remain, however, poorly quantified. Existing empirical models of plasma density tend to be oversimplified as they are based on statistical averages over static parameters. Understanding the global dynamics of the plasmasphere using observations from space remains a challenge, as existing density measurements are sparse and limited to locations where satellites can provide in-situ observations. In this dissertation, we demonstrate how such sparse electron density measurements can be used to reconstruct the global electron density distribution in the plasmasphere and capture its dynamic dependence on solar wind and geomagnetic conditions. First, we develop an automated algorithm to determine the electron density from in-situ measurements of the electric field on the Van Allen Probes spacecraft. In particular, we design a neural network to infer the upper hybrid resonance frequency from the dynamic spectrograms obtained with the Electric and Magnetic Field Instrument Suite and Integrated Science (EMFISIS) instrumentation suite, which is then used to calculate the electron number density. 
The developed Neural-network-based Upper hybrid Resonance Determination (NURD) algorithm is applied to more than four years of EMFISIS measurements to produce the publicly available electron density data set. We utilize the obtained electron density data set to develop a new global model of plasma density by employing a neural network-based modeling approach. In addition to the location, the model takes the time history of geomagnetic indices and location as inputs, and produces electron density in the equatorial plane as an output. It is extensively validated using in-situ density measurements from the Van Allen Probes mission, and also by comparing the predicted global evolution of the plasmasphere with the global IMAGE EUV images of He+ distribution. The model successfully reproduces erosion of the plasmasphere on the night side as well as plume formation and evolution, and agrees well with data. The performance of neural networks strongly depends on the availability of training data, which is limited during intervals of high geomagnetic activity. In order to provide reliable density predictions during such intervals, we can employ physics-based modeling. We develop a new approach for optimally combining the neural network- and physics-based models of the plasmasphere by means of data assimilation. The developed approach utilizes advantages of both neural network- and physics-based modeling and produces reliable global plasma density reconstructions for quiet, disturbed, and extreme geomagnetic conditions. Finally, we extend the developed machine learning-based tools and apply them to another important problem in the field of space weather, the prediction of the geomagnetic index Kp. The Kp index is one of the most widely used indicators for space weather alerts and serves as input to various models, such as for the thermosphere, the radiation belts and the plasmasphere. It is therefore crucial to predict the Kp index accurately. 
Previous work in this area has mostly employed artificial neural networks to nowcast and make short-term predictions of Kp, basing their inferences on the recent history of Kp and solar wind measurements at L1. We analyze how the performance of neural networks compares to other machine learning algorithms for nowcasting and forecasting Kp for up to 12 hours ahead. Additionally, we investigate several machine learning and information theory methods for selecting the optimal inputs to a predictive model of Kp. The developed tools for feature selection can also be applied to other problems in space physics in order to reduce the input dimensionality and identify the most important drivers. Research outlined in this dissertation clearly demonstrates that machine learning tools can be used to develop empirical models from sparse data and also can be used to understand the underlying physical processes. Combining machine learning, physics-based modeling and data assimilation allows us to develop novel methods benefiting from these different approaches.},
}

@comment{Risch 2020: doctoral thesis on deep-learning classification and recommendation of reader comments on news platforms.}
@phdthesis{Risch2020,
  author   = {Risch, Julian},
  title    = {Reader comment analysis on online news platforms},
  school   = {Universit{\"a}t Potsdam},
  year     = {2020},
  pages    = {xi, 135},
  doi      = {10.25932/publishup-48922},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-489222},
  language = {en},
  abstract = {Comment sections of online news platforms are an essential space to express opinions and discuss political topics. However, the misuse by spammers, haters, and trolls raises doubts about whether the benefits justify the costs of the time-consuming content moderation. As a consequence, many platforms limited or even shut down comment sections completely. In this thesis, we present deep learning approaches for comment classification, recommendation, and prediction to foster respectful and engaging online discussions. 
The main focus is on two kinds of comments: toxic comments, which make readers leave a discussion, and engaging comments, which make readers join a discussion. First, we discourage and remove toxic comments, e.g., insults or threats. To this end, we present a semi-automatic comment moderation process, which is based on fine-grained text classification models and supports moderators. Our experiments demonstrate that data augmentation, transfer learning, and ensemble learning allow training robust classifiers even on small datasets. To establish trust in the machine-learned models, we reveal which input features are decisive for their output with attribution-based explanation methods. Second, we encourage and highlight engaging comments, e.g., serious questions or factual statements. We automatically identify the most engaging comments, so that readers need not scroll through thousands of comments to find them. The model training process builds on upvotes and replies as a measure of reader engagement. We also identify comments that address the article authors or are otherwise relevant to them to support interactions between journalists and their readership. Taking into account the readers' interests, we further provide personalized recommendations of discussions that align with their favored topics or involve frequent co-commenters. Our models outperform multiple baselines and recent related work in experiments on comment datasets from different platforms.},
}

@comment{Koumarelas 2020: doctoral thesis on data preparation for duplicate detection and the rule-based MDedup approach.}
@phdthesis{Koumarelas2020,
  author   = {Koumarelas, Ioannis},
  title    = {Data preparation and domain-agnostic duplicate detection},
  school   = {Universit{\"a}t Potsdam},
  year     = {2020},
  pages    = {x, 97},
  doi      = {10.25932/publishup-48913},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-489131},
  language = {en},
  abstract = {Successfully completing any data science project demands careful consideration across its whole process. 
Although the focus is often put on later phases of the process, in practice, experts spend more time in earlier phases, preparing data, to make them consistent with the systems' requirements or to improve their models' accuracies. Duplicate detection is typically applied during the data cleaning phase, which is dedicated to removing data inconsistencies and improving the overall quality and usability of data. While data cleaning involves a plethora of approaches to perform specific operations, such as schema alignment and data normalization, the task of detecting and removing duplicate records is particularly challenging. Duplicates arise when multiple records representing the same entities exist in a database. Due to numerous reasons, spanning from simple typographical errors to different schemas and formats of integrated databases. Keeping a database free of duplicates is crucial for most use-cases, as their existence causes false negatives and false positives when matching queries against it. These two data quality issues have negative implications for tasks, such as hotel booking, where users may erroneously select a wrong hotel, or parcel delivery, where a parcel can get delivered to the wrong address. Identifying the variety of possible data issues to eliminate duplicates demands sophisticated approaches. While research in duplicate detection is well-established and covers different aspects of both efficiency and effectiveness, our work in this thesis focuses on the latter. We propose novel approaches to improve data quality before duplicate detection takes place and apply the latter in datasets even when prior labeling is not available. Our experiments show that improving data quality upfront can increase duplicate classification results by up to 19\%. To this end, we propose two novel pipelines that select and apply generic as well as address-specific data preparation steps with the purpose of maximizing the success of duplicate detection. 
Generic data preparation, such as the removal of special characters, can be applied to any relation with alphanumeric attributes. When applied, data preparation steps are selected only for attributes where there are positive effects on pair similarities, which indirectly affect classification, or on classification directly. Our work on addresses is twofold; first, we consider more domain-specific approaches to improve the quality of values, and, second, we experiment with known and modified versions of similarity measures to select the most appropriate per address attribute, e.g., city or country. To facilitate duplicate detection in applications where gold standard annotations are not available and obtaining them is not possible or too expensive, we propose MDedup. MDedup is a novel, rule-based, and fully automatic duplicate detection approach that is based on matching dependencies. These dependencies can be used to detect duplicates and can be discovered using state-of-the-art algorithms efficiently and without any prior labeling. MDedup uses two pipelines to first train on datasets with known labels, learning to identify useful matching dependencies, and then be applied on unseen datasets, regardless of any existing gold standard. Finally, our work is accompanied by open source code to enable repeatability of our research results and application of our approaches to other datasets.},
}