% Bibliography records for publications co-authored by Sabrina Hempel.
% Non-ASCII letters are written as LaTeX escapes (e.g. {\"u}) so the file
% stays usable with classic 8-bit BibTeX.
%
% NOTE(review): `pages` holds a single number in every entry; this looks like
% a page count rather than a page range or article number -- confirm against
% the publisher records before relying on it in a printed bibliography.

% Journal article on the IOTA association measure (Eur. Phys. J. B, 2013).
@article{HempelKoseskaNikoloski2013,
  author    = {Hempel, Sabrina and Koseska, Aneta and Nikoloski, Zoran},
  title     = {Data-driven reconstruction of directed networks},
  journal   = {The European physical journal : B, Condensed matter and complex systems},
  volume    = {86},
  number    = {6},
  pages     = {17},
  year      = {2013},
  publisher = {Springer},
  address   = {New York},
  issn      = {1434-6028},
  doi       = {10.1140/epjb/e2013-31111-8},
  language  = {en},
  abstract  = {We investigate the properties of a recently introduced asymmetric association measure, called inner composition alignment (IOTA), aimed at inferring regulatory links (couplings). We show that the measure can be used to determine the direction of coupling, detect superfluous links, and to account for autoregulation. In addition, the measure can be extended to infer the type of regulation (positive or negative). The capabilities of IOTA to correctly infer couplings together with their directionality are compared against Kendall's rank correlation for time series of different lengths, particularly focussing on biological examples. We demonstrate that an extended version of the measure, bidirectional inner composition alignment (biIOTA), increases the accuracy of the network reconstruction for short time series. Finally, we discuss the applicability of the measure to infer couplings in chaotic systems.},
}

% Journal article comparing similarity measures for network reconstruction
% (BMC Bioinformatics, 2011).
@article{HempelKoseskaNikoloskietal.2011,
  author    = {Hempel, Sabrina and Koseska, Aneta and Nikoloski, Zoran and Kurths, J{\"u}rgen},
  title     = {Unraveling gene regulatory networks from time-resolved gene expression data -- a measures comparison study},
  journal   = {BMC bioinformatics},
  volume    = {12},
  number    = {1},
  pages     = {26},
  year      = {2011},
  publisher = {BioMed Central},
  address   = {London},
  issn      = {1471-2105},
  doi       = {10.1186/1471-2105-12-292},
  language  = {en},
  abstract  = {Background: Inferring regulatory interactions between genes from transcriptomics time-resolved data, yielding reverse engineered gene regulatory networks, is of paramount importance to systems biology and bioinformatics studies. Accurate methods to address this problem can ultimately provide a deeper insight into the complexity, behavior, and functions of the underlying biological systems. However, the large number of interacting genes coupled with short and often noisy time-resolved read-outs of the system renders the reverse engineering a challenging task. Therefore, the development and assessment of methods which are computationally efficient, robust against noise, applicable to short time series data, and preferably capable of reconstructing the directionality of the regulatory interactions remains a pressing research problem with valuable applications. Results: Here we perform the largest systematic analysis of a set of similarity measures and scoring schemes within the scope of the relevance network approach which are commonly used for gene regulatory network reconstruction from time series data. In addition, we define and analyze several novel measures and schemes which are particularly suitable for short transcriptomics time series. We also compare the considered 21 measures and 6 scoring schemes according to their ability to correctly reconstruct such networks from short time series data by calculating summary statistics based on the corresponding specificity and sensitivity. Our results demonstrate that rank and symbol based measures have the highest performance in inferring regulatory interactions. In addition, the proposed scoring scheme by asymmetric weighting has shown to be valuable in reducing the number of false positive interactions. On the other hand, Granger causality as well as information-theoretic measures, frequently used in inference of regulatory networks, show low performance on the short time series analyzed in this study. Conclusions: Our study is intended to serve as a guide for choosing a particular combination of similarity measures and scoring schemes suitable for reconstruction of gene regulatory networks from short time series data. We show that further improvement of algorithms for reverse engineering can be obtained if one considers measures that are rooted in the study of symbolic dynamics or ranks, in contrast to the application of common similarity measures which do not consider the temporal character of the employed data. Moreover, we establish that the asymmetric weighting scoring scheme together with symbol based measures (for low noise level) and rank based measures (for high noise level) are the most suitable choices.},
}

% Record identified only by a URN resolver link. It shares authors and
% abstract with HempelKoseskaNikoloskietal.2011 and appears to be a repository
% copy of that article -- TODO confirm.
@misc{HempelKoseskaNikoloskietal.2017,
  author   = {Hempel, Sabrina and Koseska, Aneta and Nikoloski, Zoran and Kurths, J{\"u}rgen},
  title    = {Unraveling gene regulatory networks from time-resolved gene expression data},
  pages    = {26},
  year     = {2017},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-400924},
  language = {en},
  abstract = {Background: Inferring regulatory interactions between genes from transcriptomics time-resolved data, yielding reverse engineered gene regulatory networks, is of paramount importance to systems biology and bioinformatics studies. Accurate methods to address this problem can ultimately provide a deeper insight into the complexity, behavior, and functions of the underlying biological systems. However, the large number of interacting genes coupled with short and often noisy time-resolved read-outs of the system renders the reverse engineering a challenging task. Therefore, the development and assessment of methods which are computationally efficient, robust against noise, applicable to short time series data, and preferably capable of reconstructing the directionality of the regulatory interactions remains a pressing research problem with valuable applications. Results: Here we perform the largest systematic analysis of a set of similarity measures and scoring schemes within the scope of the relevance network approach which are commonly used for gene regulatory network reconstruction from time series data. In addition, we define and analyze several novel measures and schemes which are particularly suitable for short transcriptomics time series. We also compare the considered 21 measures and 6 scoring schemes according to their ability to correctly reconstruct such networks from short time series data by calculating summary statistics based on the corresponding specificity and sensitivity. Our results demonstrate that rank and symbol based measures have the highest performance in inferring regulatory interactions. In addition, the proposed scoring scheme by asymmetric weighting has shown to be valuable in reducing the number of false positive interactions. On the other hand, Granger causality as well as information-theoretic measures, frequently used in inference of regulatory networks, show low performance on the short time series analyzed in this study. Conclusions: Our study is intended to serve as a guide for choosing a particular combination of similarity measures and scoring schemes suitable for reconstruction of gene regulatory networks from short time series data. We show that further improvement of algorithms for reverse engineering can be obtained if one considers measures that are rooted in the study of symbolic dynamics or ranks, in contrast to the application of common similarity measures which do not consider the temporal character of the employed data. Moreover, we establish that the asymmetric weighting scoring scheme together with symbol based measures (for low noise level) and rank based measures (for high noise level) are the most suitable choices.},
}

% Journal article on regulatory elements and gene regulatory networks in
% Chlamydomonas reinhardtii (PLoS ONE, 2013). The genus name in the title is
% brace-protected so sentence-casing styles keep its capital letter.
@article{WinckArvidssonMauricioRianoPachonetal.2013,
  author    = {Winck, Flavia Vischi and Arvidsson, Samuel Janne and Ria{\~n}o-Pach{\'o}n, Diego Mauricio and Hempel, Sabrina and Koseska, Aneta and Nikoloski, Zoran and Urbina Gomez, David Alejandro and Rupprecht, Jens and M{\"u}ller-R{\"o}ber, Bernd},
  title     = {Genome-wide identification of regulatory elements and reconstruction of gene regulatory networks of the green alga {Chlamydomonas} reinhardtii under carbon deprivation},
  journal   = {PLoS ONE},
  volume    = {8},
  number    = {11},
  pages     = {16},
  year      = {2013},
  publisher = {PLoS},
  address   = {San Francisco},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0079909},
  language  = {en},
  abstract  = {The unicellular green alga Chlamydomonas reinhardtii is a long-established model organism for studies on photosynthesis and carbon metabolism-related physiology. Under conditions of air-level carbon dioxide concentration [CO2], a carbon concentrating mechanism (CCM) is induced to facilitate cellular carbon uptake. CCM increases the availability of carbon dioxide at the site of cellular carbon fixation. To improve our understanding of the transcriptional control of the CCM, we employed FAIRE-seq (formaldehyde-assisted Isolation of Regulatory Elements, followed by deep sequencing) to determine nucleosome-depleted chromatin regions of algal cells subjected to carbon deprivation. Our FAIRE data recapitulated the positions of known regulatory elements in the promoter of the periplasmic carbonic anhydrase (Cah1) gene, which is upregulated during CCM induction, and revealed new candidate regulatory elements at a genome-wide scale. In addition, time series expression patterns of 130 transcription factor (TF) and transcription regulator (TR) genes were obtained for cells cultured under photoautotrophic condition and subjected to a shift from high to low [CO2]. Groups of co-expressed genes were identified and a putative directed gene-regulatory network underlying the CCM was reconstructed from the gene expression data using the recently developed IOTA (inner composition alignment) method. Among the candidate regulatory genes, two members of the MYB-related TF family, Lcr1 (Low-CO2 response regulator 1) and Lcr2 (Low-CO2 response regulator 2), may play an important role in down-regulating the expression of a particular set of TF and TR genes in response to low [CO2]. The results obtained provide new insights into the transcriptional control of the CCM and revealed more than 60 new candidate regulatory genes. Deep sequencing of nucleosome-depleted genomic regions indicated the presence of new, previously unknown regulatory elements in the C. reinhardtii genome. Our work can serve as a basis for future functional studies of transcriptional regulator genes and genomic regulatory elements in Chlamydomonas.},
}

% Journal article on estimating ammonia emissions from a dairy barn
% (Sustainability, 2020). The dash in the title is written as `---` to keep
% the file ASCII-only.
@article{HempelAdolphsLandwehretal.2020,
  author    = {Hempel, Sabrina and Adolphs, Julian and Landwehr, Niels and Janke, David and Amon, Thomas},
  title     = {How the selection of training data and modeling approach affects the estimation of ammonia emissions from a naturally ventilated dairy barn---classical statistics versus machine learning},
  journal   = {Sustainability},
  volume    = {12},
  number    = {3},
  pages     = {18},
  year      = {2020},
  publisher = {MDPI},
  address   = {Basel},
  issn      = {2071-1050},
  doi       = {10.3390/su12031030},
  language  = {en},
  abstract  = {Environmental protection efforts can only be effective in the long term with a reliable quantification of pollutant gas emissions as a first step to mitigation. Measurement and analysis strategies must permit the accurate extrapolation of emission values. We systematically analyzed the added value of applying modern machine learning methods in the process of monitoring emissions from naturally ventilated livestock buildings to the atmosphere. We considered almost 40 weeks of hourly emission values from a naturally ventilated dairy cattle barn in Northern Germany. We compared model predictions using 27 different scenarios of temporal sampling, multiple measures of model accuracy, and eight different regression approaches. The error of the predicted emission values with the tested measurement protocols was, on average, well below 20\%. The sensitivity of the prediction to the selected training dataset was worse for the ordinary multilinear regression. Gradient boosting and random forests provided the most accurate and robust emission value predictions, accompanied by the second-smallest model errors. Most of the highly ranked scenarios involved six measurement periods, while the scenario with the best overall performance was: One measurement period in summer and three in the transition periods, each lasting for 14 days.},
}

% Journal article on assessing methane emissions of a dairy building
% (Applied Sciences, 2020). The key carries an `a` suffix to keep it distinct
% from the Sustainability entry above, which has the same author/year stem.
@article{HempelAdolphsLandwehretal.2020a,
  author    = {Hempel, Sabrina and Adolphs, Julian and Landwehr, Niels and Willink, Dilya and Janke, David and Amon, Thomas},
  title     = {Supervised machine learning to assess methane emissions of a dairy building with natural ventilation},
  journal   = {Applied Sciences},
  volume    = {10},
  number    = {19},
  pages     = {21},
  year      = {2020},
  publisher = {MDPI},
  address   = {Basel},
  issn      = {2076-3417},
  doi       = {10.3390/app10196938},
  language  = {en},
  abstract  = {A reliable quantification of greenhouse gas emissions is a basis for the development of adequate mitigation measures. Protocols for emission measurements and data analysis approaches to extrapolate to accurate annual emission values are a substantial prerequisite in this context. We systematically analyzed the benefit of supervised machine learning methods to project methane emissions from a naturally ventilated cattle building with a concrete solid floor and manure scraper located in Northern Germany. We took into account approximately 40 weeks of hourly emission measurements and compared model predictions using eight regression approaches, 27 different sampling scenarios and four measures of model accuracy. Data normalization was applied based on median and quartile range. A correlation analysis was performed to evaluate the influence of individual features. This indicated only a very weak linear relation between the methane emission and features that are typically used to predict methane emission values of naturally ventilated barns. It further highlighted the added value of including day-time and squared ambient temperature as features. The error of the predicted emission values was in general below 10\%. The results from Gaussian processes, ordinary multilinear regression and neural networks were least robust. More robust results were obtained with multilinear regression with regularization, support vector machines and particularly the ensemble methods gradient boosting and random forest. The latter had the added value to be rather insensitive against the normalization procedure. In the case of multilinear regression, also the removal of not significantly linearly related variables (i.e., keeping only the day-time component) led to robust modeling results. We concluded that measurement protocols with 7 days and six measurement periods can be considered sufficient to model methane emissions from the dairy barn with solid floor with manure scraper, particularly when periods are distributed over the year with a preference for transition periods. Features should be normalized according to median and quartile range and must be carefully selected depending on the modeling approach.},
}