% Bibliography database: ten entries (2005-2012), all listing "Selbig, Joachim"
% among the authors. Citation keys are kept exactly as exported.
%
% Layout: one entry per block, one field per line, values in braces.
% Abstracts are kept on a single line each so their text is not re-wrapped.
%
% TODO(review): the article entries ScholzKaplanGuyetal.2005,
% HischeLuisDominguezPfeifferetal.2010, SteinfathStrehmelPetersetal.2010,
% GrellSchaubSelbig2006, BeerenwinkelSingLengaueretal.2005 and
% HummelKeshvariWeckwerthetal.2005 carry no journal field (required for the
% article type); fill in from the publisher record.

@article{ScholzKaplanGuyetal.2005,
  author   = {Scholz, Matthias and Kaplan, F. and Guy, C. L. and Kopka, Joachim and Selbig, Joachim},
  title    = {Non-linear {PCA}: a missing data approach},
  issn     = {1367-4803},
  year     = {2005},
  abstract = {Motivation: Visualizing and analysing the potential non-linear structure of a dataset is becoming an important task in molecular biology. This is even more challenging when the data have missing values. Results: Here, we propose an inverse model that performs non-linear principal component analysis (NLPCA) from incomplete datasets. Missing values are ignored while optimizing the model, but can be estimated afterwards. Results are shown for both artificial and experimental datasets. In contrast to linear methods, non-linear methods were able to give better missing value estimations for non-linear structured data. Application: We applied this technique to a time course of metabolite data from a cold stress experiment on the model plant Arabidopsis thaliana, and could approximate the mapping function from any time point to the metabolite responses. Thus, the inverse NLPCA provides greatly improved information for better understanding the complex response to cold stress.},
  language = {en}
}

@misc{HischeLarhlimiSchwarzetal.2012,
  author   = {Hische, Manuela and Larhlimi, Abdelhalim and Schwarz, Franziska and Fischer-Rosinsk{\'y}, Antje and Bobbert, Thomas and Assmann, Anke and Catchpole, Gareth S. and Pfeiffer, Andreas F. H. and Willmitzer, Lothar and Selbig, Joachim and Spranger, Joachim},
  title    = {A distinct metabolic signature predicts development of fasting plasma glucose},
  series   = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  journal  = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  number   = {850},
  issn     = {1866-8372},
  doi      = {10.25932/publishup-42740},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-427400},
  pages    = {12},
  year     = {2012},
  abstract = {Background High blood glucose and diabetes are amongst the conditions causing the greatest losses in years of healthy life worldwide. Therefore, numerous studies aim to identify reliable risk markers for development of impaired glucose metabolism and type 2 diabetes. However, the molecular basis of impaired glucose metabolism is so far insufficiently understood. The development of so called 'omics' approaches in the recent years promises to identify molecular markers and to further understand the molecular basis of impaired glucose metabolism and type 2 diabetes. Although univariate statistical approaches are often applied, we demonstrate here that the application of multivariate statistical approaches is highly recommended to fully capture the complexity of data gained using high-throughput methods. Methods We took blood plasma samples from 172 subjects who participated in the prospective Metabolic Syndrome Berlin Potsdam follow-up study (MESY-BEPO Follow-up). We analysed these samples using Gas Chromatography coupled with Mass Spectrometry (GC-MS), and measured 286 metabolites. Furthermore, fasting glucose levels were measured using standard methods at baseline, and after an average of six years. We did correlation analysis and built linear regression models as well as Random Forest regression models to identify metabolites that predict the development of fasting glucose in our cohort. Results We found a metabolic pattern consisting of nine metabolites that predicted fasting glucose development with an accuracy of 0.47 in tenfold cross-validation using Random Forest regression. We also showed that adding established risk markers did not improve the model accuracy. However, external validation is eventually desirable. Although not all metabolites belonging to the final pattern are identified yet, the pattern directs attention to amino acid metabolism, energy metabolism and redox homeostasis. Conclusions We demonstrate that metabolites identified using a high-throughput method (GC-MS) perform well in predicting the development of fasting plasma glucose over several years. Notably, not single, but a complex pattern of metabolites propels the prediction and therefore reflects the complexity of the underlying molecular mechanisms. This result could only be captured by application of multivariate statistical approaches. Therefore, we highly recommend the usage of statistical methods that seize the complexity of the information given by high-throughput methods.},
  language = {en}
}

@article{HischeLuisDominguezPfeifferetal.2010,
  author   = {Hische, Manuela and Luis-Dominguez, Olga and Pfeiffer, Andreas F. H. and Schwarz, Peter E. and Selbig, Joachim and Spranger, Joachim},
  title    = {Decision trees as a simple-to-use and reliable tool to identify individuals with impaired glucose metabolism or type 2 diabetes mellitus},
  issn     = {0804-4643},
  doi      = {10.1530/Eje-10-0649},
  year     = {2010},
  abstract = {Objective: The prevalence of unknown impaired fasting glucose (IFG), impaired glucose tolerance (IGT), or type 2 diabetes mellitus (T2DM) is high. Numerous studies demonstrated that IFG, IGT, or T2DM are associated with increased cardiovascular risk, therefore an improved identification strategy would be desirable. The objective of this study was to create a simple and reliable tool to identify individuals with impaired glucose metabolism (IGM). Design and methods: A cohort of 1737 individuals (1055 controls, 682 with previously unknown IGM) was screened by 75 g oral glucose tolerance test (OGTT). Supervised machine learning was used to automatically generate decision trees to identify individuals with IGM. To evaluate the accuracy of identification, a tenfold cross-validation was performed. Resulting trees were subsequently re-evaluated in a second, independent cohort of 1998 individuals (1253 controls, 745 unknown IGM). Results: A clinical decision tree included age and systolic blood pressure (sensitivity 89.3\%, specificity 37.4\%, and positive predictive value (PPV) 48.0\%), while a tree based on clinical and laboratory data included fasting glucose and systolic blood pressure (sensitivity 89.7\%, specificity 54.6\%, and PPV 56.2\%). The inclusion of additional parameters did not improve test quality. The external validation approach confirmed the presented decision trees. Conclusion: We proposed a simple tool to identify individuals with existing IGM. From a practical perspective, fasting blood glucose and blood pressure measurements should be regularly measured in all individuals presenting in outpatient clinics. An OGTT appears to be useful only if the subjects are older than 48 years or show abnormalities in fasting glucose or blood pressure.},
  language = {en}
}

@article{SteinfathStrehmelPetersetal.2010,
  author   = {Steinfath, Matthias and Strehmel, Nadine and Peters, Rolf and Schauer, Nicolas and Groth, Detlef and Hummel, Jan and Steup, Martin and Selbig, Joachim and Kopka, Joachim and Geigenberger, Peter and van Dongen, Joost T.},
  title    = {Discovering plant metabolic biomarkers for phenotype prediction using an untargeted approach},
  issn     = {1467-7644},
  doi      = {10.1111/j.1467-7652.2010.00516.x},
  year     = {2010},
  abstract = {Biomarkers are used to predict phenotypical properties before these features become apparent and, therefore, are valuable tools for both fundamental and applied research. Diagnostic biomarkers have been discovered in medicine many decades ago and are now commonly applied. While this is routine in the field of medicine, it is of surprise that in agriculture this approach has never been investigated. Up to now, the prediction of phenotypes in plants was based on growing plants and assaying the organs of interest in a time intensive process. For the first time, we demonstrate in this study the application of metabolomics to predict agronomic important phenotypes of a crop plant that was grown in different environments. Our procedure consists of established techniques to screen untargeted for a large amount of metabolites in parallel, in combination with machine learning methods. By using this combination of metabolomics and biomathematical tools metabolites were identified that can be used as biomarkers to improve the prediction of traits. The predictive metabolites can be selected and used subsequently to develop fast, targeted and low-cost diagnostic biomarker assays that can be implemented in breeding programs or quality assessment analysis. The identified metabolic biomarkers allow for the prediction of crop product quality. Furthermore, marker-assisted selection can benefit from the discovery of metabolic biomarkers when other molecular markers come to its limitation. The described marker selection method was developed for potato tubers, but is generally applicable to any crop and trait as it functions independently of genomic information.},
  language = {en}
}

@article{MoehligFloeterSprangeretal.2006,
  author    = {Moehlig, M. and Floeter, A. and Spranger, Joachim and Weickert, Martin O. and Schill, T. and Schloesser, H. W. and Brabant, G. and Pfeiffer, Andreas F. H. and Selbig, Joachim and Schoefl, C.},
  title     = {Predicting impaired glucose metabolism in women with polycystic ovary syndrome by decision tree modelling},
  series    = {Diabetologia : journal of the European Association for the Study of Diabetes (EASD)},
  volume    = {49},
  journal   = {Diabetologia : journal of the European Association for the Study of Diabetes (EASD)},
  publisher = {Springer},
  address   = {Berlin},
  issn      = {0012-186X},
  doi       = {10.1007/s00125-006-0395-0},
  pages     = {2572--2579},
  year      = {2006},
  abstract  = {Aims/hypothesis Polycystic ovary syndrome (PCOS) is a risk factor of type 2 diabetes. Screening for impaired glucose metabolism (IGM) with an OGTT has been recommended, but this is relatively time-consuming and inconvenient. Thus, a strategy that could minimise the need for an OGTT would be beneficial. Materials and methods Consecutive PCOS patients (n=118) with fasting glucose < 6.1 mmol/l were included in the study. Parameters derived from medical history, clinical examination and fasting blood samples were assessed by decision tree modelling for their ability to discriminate women with IGM (2-h OGTT value >= 7.8 mmol/l) from those with NGT. Results According to the OGTT results, 93 PCOS women had NGT and 25 had IGM. The best decision tree consisted of HOMA-IR, the proinsulin:insulin ratio, proinsulin, 17-OH progesterone and the ratio of luteinising hormone:follicle-stimulating hormone. This tree identified 69 women with NGT. The remaining 49 women included all women with IGM (100\% sensitivity, 74\% specificity to detect IGM). Pruning this tree to three levels still identified 53 women with NGT (100\% sensitivity, 57\% specificity to detect IGM). Restricting the data matrix used for tree modelling to medical history and clinical parameters produced a tree using BMI, waist circumference and WHR. Pruning this tree to two levels separated 27 women with NGT (100\% sensitivity, 29\% specificity to detect IGM). The validity of both trees was tested by a leave-10\%-out cross-validation. Conclusions/interpretation Decision trees are useful tools for separating PCOS women with NGT from those with IGM. They can be used for stratifying the metabolic screening of PCOS women, whereby the number of OGTTs can be markedly reduced.},
  language  = {en}
}

@article{SteuerHumburgSelbig2006,
  author    = {Steuer, Ralf and Humburg, Peter and Selbig, Joachim},
  title     = {Validation and functional annotation of expression-based clusters based on gene ontology},
  series    = {BMC bioinformatics},
  volume    = {7},
  journal   = {BMC bioinformatics},
  number    = {380},
  publisher = {BioMed Central},
  address   = {London},
  issn      = {1471-2105},
  doi       = {10.1186/1471-2105-7-380},
  pages     = {12},
  year      = {2006},
  abstract  = {Background: The biological interpretation of large-scale gene expression data is one of the paramount challenges in current bioinformatics. In particular, placing the results in the context of other available functional genomics data, such as existing bio-ontologies, has already provided substantial improvement for detecting and categorizing genes of interest. One common approach is to look for functional annotations that are significantly enriched within a group or cluster of genes, as compared to a reference group. Results: In this work, we suggest the information-theoretic concept of mutual information to investigate the relationship between groups of genes, as given by data-driven clustering, and their respective functional categories. Drawing upon related approaches (Gibbons and Roth, Genome Research 12: 1574-1581, 2002), we seek to quantify to what extent individual attributes are sufficient to characterize a given group or cluster of genes. Conclusion: We show that the mutual information provides a systematic framework to assess the relationship between groups or clusters of genes and their functional annotations in a quantitative way. Within this framework, the mutual information allows us to address and incorporate several important issues, such as the interdependence of functional annotations and combinatorial combinations of attributes. It thus supplements and extends the conventional search for overrepresented attributes within a group or cluster of genes. In particular taking combinations of attributes into account, the mutual information opens the way to uncover specific functional descriptions of a group of genes or clustering result. All datasets and functional annotations used in this study are publicly available. All scripts used in the analysis are provided as additional files.},
  language  = {en}
}

@misc{NeigenfindGyetvaiBasekowetal.2008,
  author   = {Neigenfind, Jost and Gyetvai, Gabor and Basekow, Rico and Diehl, Svenja and Achenbach, Ute and Gebhardt, Christiane and Selbig, Joachim and Kersten, Birgit},
  title    = {Haplotype inference from unphased {SNP} data in heterozygous polyploids based on {SAT}},
  series   = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  journal  = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  number   = {883},
  issn     = {1866-8372},
  doi      = {10.25932/publishup-43501},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-435011},
  pages    = {28},
  year     = {2008},
  abstract = {Background: Haplotype inference based on unphased SNP markers is an important task in population genetics. Although there are different approaches to the inference of haplotypes in diploid species, the existing software is not suitable for inferring haplotypes from unphased SNP data in polyploid species, such as the cultivated potato (Solanum tuberosum). Potato species are tetraploid and highly heterozygous. Results: Here we present the software SATlotyper which is able to handle polyploid and polyallelic data. SATlotyper uses the Boolean satisfiability problem to formulate Haplotype Inference by Pure Parsimony. The software excludes existing haplotype inferences, thus allowing for calculation of alternative inferences. As it is not known which of the multiple haplotype inferences are best supported by the given unphased data set, we use a bootstrapping procedure that allows for scoring of alternative inferences. Finally, by means of the bootstrapping scores, it is possible to optimise the phased genotypes belonging to a given haplotype inference. The program is evaluated with simulated and experimental SNP data generated for heterozygous tetraploid populations of potato. We show that, instead of taking the first haplotype inference reported by the program, we can significantly improve the quality of the final result by applying additional methods that include scoring of the alternative haplotype inferences and genotype optimisation. For a sub-population of nineteen individuals, the predicted results computed by SATlotyper were directly compared with results obtained by experimental haplotype inference via sequencing of cloned amplicons. Prediction and experiment gave similar results regarding the inferred haplotypes and phased genotypes. Conclusion: Our results suggest that Haplotype Inference by Pure Parsimony can be solved efficiently by the SAT approach, even for data sets of unphased SNP from heterozygous polyploids. SATlotyper is freeware and is distributed as a Java JAR file. The software can be downloaded from the webpage of the GABI Primary Database at http://www.gabipd.org/projects/satlotyper/. The application of SATlotyper will provide haplotype information, which can be used in haplotype association mapping studies of polyploid plants.},
  language = {en}
}

@article{GrellSchaubSelbig2006,
  author   = {Grell, Susanne and Schaub, Torsten H. and Selbig, Joachim},
  title    = {Modelling biological networks by action languages via answer set programming},
  issn     = {0302-9743},
  doi      = {10.1007/11799573},
  year     = {2006},
  language = {en}
}

@article{BeerenwinkelSingLengaueretal.2005,
  author   = {Beerenwinkel, Niko and Sing, Tobias and Lengauer, Thomas and Rahnenfuhrer, Joerg and Roomp, Kirsten and Savenkov, Igor and Fischer, Roman and Hoffmann, Daniel and Selbig, Joachim and Korn, Klaus and Walter, Hauke and Berg, Thomas and Braun, Patrick and Faetkenheuer, Gerd and Oette, Mark and Rockstroh, Juergen and Kupfer, Bernd and Kaiser, Rolf and Daeumer, Martin},
  title    = {Computational methods for the design of effective therapies against drug resistant {HIV} strains},
  year     = {2005},
  abstract = {The development of drug resistance is a major obstacle to successful treatment of HIV infection. The extraordinary replication dynamics of HIV facilitates its escape from selective pressure exerted by the human immune system and by combination drug therapy. We have developed several computational methods whose combined use can support the design of optimal antiretroviral therapies based on viral genomic data.},
  language = {en}
}

@article{HummelKeshvariWeckwerthetal.2005,
  author   = {Hummel, Jan and Keshvari, N. and Weckwerth, Wolfram and Selbig, Joachim},
  title    = {Species-specific analysis of protein sequence motifs using mutual information},
  issn     = {1471-2105},
  year     = {2005},
  abstract = {Background: Protein sequence motifs are by definition short fragments of conserved amino acids, often associated with a specific function. Accordingly protein sequence profiles derived from multiple sequence alignments provide an alternative description of functional motifs characterizing families of related sequences. Such profiles conveniently reflect functional necessities by pointing out proximity at conserved sequence positions as well as depicting distances at variable positions. Discovering significant conservation characteristics within the variable positions of profiles mirrors group-specific and, in particular, evolutionary features of the underlying sequences. Results: We describe the tool PROfile analysis based on Mutual Information (PROMI) that enables comparative analysis of user-classified protein sequences. PROMI is implemented as a web service using Perl and R as well as other publicly available packages and tools on the server-side. On the client-side platform-independence is achieved by generally applied internet delivery standards. As one possible application analysis of the zinc finger C2H2-type protein domain is introduced to illustrate the functionality of the tool. Conclusion: The web service PROMI should assist researchers to detect evolutionary correlations in protein profiles of defined biological sequences. It is available at http://promi.mpimp-golm.mpg.de where additional documentation can be found.},
  language = {en}
}