% Bibliography database (repository export), normalised to one field per line.
%
% Conventions used in this file:
%   * Citation keys are unique. Where the export contained two records under
%     the same key, the FIRST record keeps the original key (that is the one
%     classic BibTeX resolved) and the later record carries an "a" suffix.
%   * Page and issue ranges use a double hyphen without surrounding spaces.
%   * Non-ASCII characters are written as LaTeX escapes so the file also works
%     with classic 8-bit BibTeX.
%   * Words in titles that must keep their capitalisation are brace-protected.
%   * "language" is kept from the export; standard styles ignore it.

% Journal article (Machine Learning 108, issue 8-9).
@article{PrasseKnaebelMachlicaetal.2019,
  author    = {Prasse, Paul and Knaebel, Rene and Machlica, Lukas and Pevny, Tomas and Scheffer, Tobias},
  title     = {Joint detection of malicious domains and infected clients},
  journal   = {Machine learning},
  volume    = {108},
  number    = {8--9},
  pages     = {1353--1368},
  publisher = {Springer},
  address   = {Dordrecht},
  year      = {2019},
  issn      = {0885-6125},
  doi       = {10.1007/s10994-019-05789-z},
  abstract  = {Detection of malware-infected computers and detection of malicious web domains based on their encrypted HTTPS traffic are challenging problems, because only addresses, timestamps, and data volumes are observable. The detection problems are coupled, because infected clients tend to interact with malicious domains. Traffic data can be collected at a large scale, and antivirus tools can be used to identify infected clients in retrospect. Domains, by contrast, have to be labeled individually after forensic analysis. We explore transfer learning based on sluice networks; this allows the detection models to bootstrap each other. In a large-scale experimental study, we find that the model outperforms known reference models and detects previously unknown malware, previously unknown malware families, and previously unknown malicious domains.},
  language  = {en},
}

% Journal article in "Cancers" (journal identified by ISSN 2072-6694 and the
% DOI suffix cancers14163950; MDPI is the publisher, 16 is the issue number).
@article{PrasseIversenLienhardetal.2022,
  author    = {Prasse, Paul and Iversen, Pascal and Lienhard, Matthias and Thedinga, Kristina and Herwig, Ralf and Scheffer, Tobias},
  title     = {Pre-Training on In Vitro and Fine-Tuning on Patient-Derived Data Improves Deep Neural Networks for Anti-Cancer Drug-Sensitivity Prediction},
  journal   = {Cancers},
  volume    = {14},
  number    = {16},
  pages     = {1--14},
  publisher = {MDPI},
  address   = {Basel},
  year      = {2022},
  issn      = {2072-6694},
  doi       = {10.3390/cancers14163950},
  abstract  = {Large-scale databases that report the inhibitory capacities of many combinations of candidate drug compounds and cultivated cancer cell lines have driven the development of preclinical drug-sensitivity models based on machine learning. However, cultivated cell lines have devolved from human cancer cells over years or even decades under selective pressure in culture conditions. Moreover, models that have been trained on in vitro data cannot account for interactions with other types of cells. Drug-response data that are based on patient-derived cell cultures, xenografts, and organoids, on the other hand, are not available in the quantities that are needed to train high-capacity machine-learning models. We found that pre-training deep neural network models of drug sensitivity on in vitro drug-sensitivity databases before fine-tuning the model parameters on patient-derived data improves the models' accuracy and improves the biological plausibility of the features, compared to training only on patient-derived data. From our experiments, we can conclude that pre-trained models outperform models that have been trained on the target domains in the vast majority of cases.},
  language  = {en},
}

% Unpublished manuscript hosted on a repository server; "note" is a required
% field for this entry type. The citation key keeps the export's spelling,
% while the author field uses the spelling found in the 2019 entry above.
@unpublished{PrasseGrubenMachlikaetal.2016,
  author   = {Prasse, Paul and Gruben, Gerrit and Machlica, Lukas and Pevny, Tomas and Sofka, Michal and Scheffer, Tobias},
  title    = {Malware Detection by {HTTPS} Traffic Analysis},
  note     = {Unpublished manuscript},
  pages    = {10},
  year     = {2016},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-100942},
  abstract = {In order to evade detection by network-traffic analysis, a growing proportion of malware uses the encrypted HTTPS protocol. We explore the problem of detecting malware on client computers based on HTTPS traffic analysis. In this setting, malware has to be detected based on the host IP address, ports, timestamp, and data volume information of TCP/IP packets that are sent and received by all the applications on the client. We develop a scalable protocol that allows us to collect network flows of known malicious and benign applications as training data and derive a malware-detection method based on a neural networks and sequence classification. We study the method's ability to detect known and new, unknown malware in a large-scale empirical study.},
  language = {en},
}

% Postprint (repository series, no. 964) of the RainNet paper. This record
% came first in the export and therefore keeps the original key; the journal
% version is AyzelSchefferHeistermann2020a below.
@misc{AyzelSchefferHeistermann2020,
  author   = {Ayzel, Georgy and Scheffer, Tobias and Heistermann, Maik},
  title    = {{RainNet} v1.0},
  series   = {Postprints der Universit{\"a}t Potsdam : Mathematisch-Naturwissenschaftliche Reihe},
  number   = {964},
  pages    = {16},
  year     = {2020},
  issn     = {1866-8372},
  doi      = {10.25932/publishup-47294},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-472942},
  abstract = {In this study, we present RainNet, a deep convolutional neural network for radar-based precipitation nowcasting. Its design was inspired by the U-Net and SegNet families of deep learning models, which were originally designed for binary segmentation tasks. RainNet was trained to predict continuous precipitation intensities at a lead time of 5min, using several years of quality-controlled weather radar composites provided by the German Weather Service (DWD). That data set covers Germany with a spatial domain of 900km $\times$ 900km and has a resolution of 1km in space and 5min in time. Independent verification experiments were carried out on 11 summer precipitation events from 2016 to 2017. In order to achieve a lead time of 1h, a recursive approach was implemented by using RainNet predictions at 5min lead times as model inputs for longer lead times. In the verification experiments, trivial Eulerian persistence and a conventional model based on optical flow served as benchmarks. The latter is available in the rainymotion library and had previously been shown to outperform DWD's operational nowcasting model for the same set of verification events. RainNet significantly outperforms the benchmark models at all lead times up to 60min for the routine verification metrics mean absolute error (MAE) and the critical success index (CSI) at intensity thresholds of 0.125, 1, and 5mm h$^{-1}$. However, rainymotion turned out to be superior in predicting the exceedance of higher intensity thresholds (here 10 and 15mm h$^{-1}$). The limited ability of RainNet to predict heavy rainfall intensities is an undesirable property which we attribute to a high level of spatial smoothing introduced by the model. At a lead time of 5min, an analysis of power spectral density confirmed a significant loss of spectral power at length scales of 16km and below. Obviously, RainNet had learned an optimal level of smoothing to produce a nowcast at 5min lead time. In that sense, the loss of spectral power at small scales is informative, too, as it reflects the limits of predictability as a function of spatial scale. Beyond the lead time of 5min, however, the increasing level of smoothing is a mere artifact -- an analogue to numerical diffusion -- that is not a property of RainNet itself but of its recursive application. In the context of early warning, the smoothing is particularly unfavorable since pronounced features of intense precipitation tend to get lost over longer lead times. Hence, we propose several options to address this issue in prospective research, including an adjustment of the loss function for model training, model training for longer lead times, and the prediction of threshold exceedance in terms of a binary segmentation task. Furthermore, we suggest additional input data that could help to better identify situations with imminent precipitation dynamics. The model code, pretrained weights, and training data are provided in open repositories as an input for such future studies.},
  language = {en},
}

% Journal version of the RainNet paper. The export reused the postprint's key
% for this record; the "a" suffix makes it citable.
@article{AyzelSchefferHeistermann2020a,
  author    = {Ayzel, Georgy and Scheffer, Tobias and Heistermann, Maik},
  title     = {{RainNet} v1.0},
  journal   = {Geoscientific Model Development},
  volume    = {13},
  number    = {6},
  pages     = {2631--2644},
  publisher = {Copernicus Publ.},
  address   = {G{\"o}ttingen},
  year      = {2020},
  issn      = {1991-959X},
  doi       = {10.5194/gmd-13-2631-2020},
  abstract  = {In this study, we present RainNet, a deep convolutional neural network for radar-based precipitation nowcasting. Its design was inspired by the U-Net and SegNet families of deep learning models, which were originally designed for binary segmentation tasks. RainNet was trained to predict continuous precipitation intensities at a lead time of 5min, using several years of quality-controlled weather radar composites provided by the German Weather Service (DWD). That data set covers Germany with a spatial domain of 900km $\times$ 900km and has a resolution of 1km in space and 5min in time. Independent verification experiments were carried out on 11 summer precipitation events from 2016 to 2017. In order to achieve a lead time of 1h, a recursive approach was implemented by using RainNet predictions at 5min lead times as model inputs for longer lead times. In the verification experiments, trivial Eulerian persistence and a conventional model based on optical flow served as benchmarks. The latter is available in the rainymotion library and had previously been shown to outperform DWD's operational nowcasting model for the same set of verification events. RainNet significantly outperforms the benchmark models at all lead times up to 60min for the routine verification metrics mean absolute error (MAE) and the critical success index (CSI) at intensity thresholds of 0.125, 1, and 5mm h$^{-1}$. However, rainymotion turned out to be superior in predicting the exceedance of higher intensity thresholds (here 10 and 15mm h$^{-1}$). The limited ability of RainNet to predict heavy rainfall intensities is an undesirable property which we attribute to a high level of spatial smoothing introduced by the model. At a lead time of 5min, an analysis of power spectral density confirmed a significant loss of spectral power at length scales of 16km and below. Obviously, RainNet had learned an optimal level of smoothing to produce a nowcast at 5min lead time. In that sense, the loss of spectral power at small scales is informative, too, as it reflects the limits of predictability as a function of spatial scale. Beyond the lead time of 5min, however, the increasing level of smoothing is a mere artifact -- an analogue to numerical diffusion -- that is not a property of RainNet itself but of its recursive application. In the context of early warning, the smoothing is particularly unfavorable since pronounced features of intense precipitation tend to get lost over longer lead times. Hence, we propose several options to address this issue in prospective research, including an adjustment of the loss function for model training, model training for longer lead times, and the prediction of threshold exceedance in terms of a binary segmentation task. Furthermore, we suggest additional input data that could help to better identify situations with imminent precipitation dynamics. The model code, pretrained weights, and training data are provided in open repositories as an input for such future studies.},
  language  = {en},
}

% Secondary publication (repository series) of the 2022 drug-sensitivity
% article. The export reused the journal article's key for this record; the
% "a" suffix makes it citable.
@misc{PrasseIversenLienhardetal.2022a,
  author    = {Prasse, Paul and Iversen, Pascal and Lienhard, Matthias and Thedinga, Kristina and Herwig, Ralf and Scheffer, Tobias},
  title     = {Pre-Training on In Vitro and Fine-Tuning on Patient-Derived Data Improves Deep Neural Networks for Anti-Cancer Drug-Sensitivity Prediction},
  series    = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Mathematisch-Naturwissenschaftliche Reihe},
  pages     = {1--14},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  year      = {2022},
  issn      = {1866-8372},
  doi       = {10.25932/publishup-57734},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-577341},
  abstract  = {Large-scale databases that report the inhibitory capacities of many combinations of candidate drug compounds and cultivated cancer cell lines have driven the development of preclinical drug-sensitivity models based on machine learning. However, cultivated cell lines have devolved from human cancer cells over years or even decades under selective pressure in culture conditions. Moreover, models that have been trained on in vitro data cannot account for interactions with other types of cells. Drug-response data that are based on patient-derived cell cultures, xenografts, and organoids, on the other hand, are not available in the quantities that are needed to train high-capacity machine-learning models. We found that pre-training deep neural network models of drug sensitivity on in vitro drug-sensitivity databases before fine-tuning the model parameters on patient-derived data improves the models' accuracy and improves the biological plausibility of the features, compared to training only on patient-derived data. From our experiments, we can conclude that pre-trained models outperform models that have been trained on the target domains in the vast majority of cases.},
  language  = {en},
}

% The export listed this work twice under the same key; the two records
% differed only in the spelling of the second author (with and without the
% umlaut). They are merged here, keeping the umlaut spelling.
@article{BickelBruecknerScheffer2009,
  author        = {Bickel, Steffen and Br{\"u}ckner, Michael and Scheffer, Tobias},
  title         = {Discriminative learning under covariate shift},
  journal       = {Journal of Machine Learning Research},
  year          = {2009},
  issn          = {1532-4435},
  abstract      = {We address classification problems for which the training instances are governed by an input distribution that is allowed to differ arbitrarily from the test distribution---problems also referred to as classification under covariate shift. We derive a solution that is purely discriminative: neither training nor test distribution are modeled explicitly. The problem of learning under covariate shift can be written as an integrated optimization problem. Instantiating the general optimization problem leads to a kernel logistic regression and an exponential model classifier for covariate shift. The optimization problem is convex under certain conditions; our findings also clarify the relationship to the known kernel mean matching procedure. We report on experiments on problems of spam filtering, text classification, and landmine detection.},
  language      = {en},
  internal-note = {NOTE(review): journal inferred from the ISSN; volume and pages were not in the export -- add after checking the publisher record},
}

% The export typed this record as a journal article but supplied an ISBN and
% no journal, i.e. it is a paper in a proceedings volume.
@inproceedings{HaiderScheffer2009,
  author        = {Haider, Peter and Scheffer, Tobias},
  title         = {{Bayesian} clustering for email campaign detection},
  booktitle     = {Proceedings of the 26th Annual International Conference on Machine Learning},
  year          = {2009},
  isbn          = {978-1-605-58516-1},
  language      = {en},
  internal-note = {NOTE(review): booktitle inferred from the ISBN -- confirm against the proceedings front matter},
}

% Correspondence published in a journal (volume, issue and pages present), so
% it is typed as an article rather than misc.
@article{PatilHaiderPopeetal.2011,
  author    = {Patil, Kaustubh R. and Haider, Peter and Pope, Phillip B. and Turnbaugh, Peter J. and Morrison, Mark and Scheffer, Tobias and McHardy, Alice C.},
  title     = {Taxonomic metagenome sequence assignment with structured output models},
  journal   = {Nature methods : techniques for life scientists and chemists},
  volume    = {8},
  number    = {3},
  pages     = {191--192},
  publisher = {Nature Publ. Group},
  address   = {London},
  year      = {2011},
  issn      = {1548-7091},
  doi       = {10.1038/nmeth0311-191},
  language  = {en},
}