% Bibliography database: publications co-authored by Joachim Selbig (2004-2008).
% Text outside entries is ignored by BibTeX, so these percent-prefixed lines
% are annotations only. Citation keys are kept as exported.

% Article number 380 (see the DOI suffix) is stored in `pages`; the page
% count (12) is stored in `pagetotal`, which classic BibTeX styles ignore.
@article{SteuerHumburgSelbig2006,
  author    = {Steuer, Ralf and Humburg, Peter and Selbig, Joachim},
  title     = {Validation and functional annotation of expression-based clusters based on gene ontology},
  journal   = {BMC Bioinformatics},
  volume    = {7},
  pages     = {380},
  pagetotal = {12},
  publisher = {BioMed Central},
  address   = {London},
  issn      = {1471-2105},
  doi       = {10.1186/1471-2105-7-380},
  year      = {2006},
  abstract  = {Background: The biological interpretation of large-scale gene expression data is one of the paramount challenges in current bioinformatics. In particular, placing the results in the context of other available functional genomics data, such as existing bio-ontologies, has already provided substantial improvement for detecting and categorizing genes of interest. One common approach is to look for functional annotations that are significantly enriched within a group or cluster of genes, as compared to a reference group. Results: In this work, we suggest the information-theoretic concept of mutual information to investigate the relationship between groups of genes, as given by data-driven clustering, and their respective functional categories. Drawing upon related approaches (Gibbons and Roth, Genome Research 12: 1574-1581, 2002), we seek to quantify to what extent individual attributes are sufficient to characterize a given group or cluster of genes. Conclusion: We show that the mutual information provides a systematic framework to assess the relationship between groups or clusters of genes and their functional annotations in a quantitative way. Within this framework, the mutual information allows us to address and incorporate several important issues, such as the interdependence of functional annotations and combinatorial combinations of attributes. It thus supplements and extends the conventional search for overrepresented attributes within a group or cluster of genes. In particular taking combinations of attributes into account, the mutual information opens the way to uncover specific functional descriptions of a group of genes or clustering result. All datasets and functional annotations used in this study are publicly available. All scripts used in the analysis are provided as additional files.},
  language  = {en}
}

% Postprint in a university series; `pages` here is the page count as exported.
% Acronyms in the title are braced so sentence-casing styles keep them.
@misc{NeigenfindGyetvaiBasekowetal.2008,
  author   = {Neigenfind, Jost and Gyetvai, Gabor and Basekow, Rico and Diehl, Svenja and Achenbach, Ute and Gebhardt, Christiane and Selbig, Joachim and Kersten, Birgit},
  title    = {Haplotype inference from unphased {SNP} data in heterozygous polyploids based on {SAT}},
  series   = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  journal  = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  number   = {883},
  issn     = {1866-8372},
  doi      = {10.25932/publishup-43501},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-435011},
  pages    = {28},
  year     = {2008},
  abstract = {Background: Haplotype inference based on unphased SNP markers is an important task in population genetics. Although there are different approaches to the inference of haplotypes in diploid species, the existing software is not suitable for inferring haplotypes from unphased SNP data in polyploid species, such as the cultivated potato (Solanum tuberosum). Potato species are tetraploid and highly heterozygous. Results: Here we present the software SATlotyper which is able to handle polyploid and polyallelic data. SATlotyper uses the Boolean satisfiability problem to formulate Haplotype Inference by Pure Parsimony. The software excludes existing haplotype inferences, thus allowing for calculation of alternative inferences. As it is not known which of the multiple haplotype inferences are best supported by the given unphased data set, we use a bootstrapping procedure that allows for scoring of alternative inferences. Finally, by means of the bootstrapping scores, it is possible to optimise the phased genotypes belonging to a given haplotype inference. The program is evaluated with simulated and experimental SNP data generated for heterozygous tetraploid populations of potato. We show that, instead of taking the first haplotype inference reported by the program, we can significantly improve the quality of the final result by applying additional methods that include scoring of the alternative haplotype inferences and genotype optimisation. For a sub-population of nineteen individuals, the predicted results computed by SATlotyper were directly compared with results obtained by experimental haplotype inference via sequencing of cloned amplicons. Prediction and experiment gave similar results regarding the inferred haplotypes and phased genotypes. Conclusion: Our results suggest that Haplotype Inference by Pure Parsimony can be solved efficiently by the SAT approach, even for data sets of unphased SNP from heterozygous polyploids. SATlotyper is freeware and is distributed as a Java JAR file. The software can be downloaded from the webpage of the GABI Primary Database at http://www.gabipd.org/projects/satlotyper/. The application of SATlotyper will provide haplotype information, which can be used in haplotype association mapping studies of polyploid plants.},
  language = {en}
}

% NOTE(review): no journal or booktitle is given, so BibTeX will warn about a
% missing required field. The ISSN and DOI presumably identify a book series
% and volume rather than a journal article - confirm venue and entry type.
% NOTE(review): the title may be missing a word (possibly "answer set
% programming") - verify against the publisher record.
@article{GrellSchaubSelbig2006,
  author   = {Grell, Susanne and Schaub, Torsten H. and Selbig, Joachim},
  title    = {Modelling biological networks by action languages via set programming},
  issn     = {0302-9743},
  doi      = {10.1007/11799573},
  year     = {2006},
  language = {en}
}

% NOTE(review): journal is missing and cannot be derived from this record - TODO confirm.
@article{BeerenwinkelSingLengaueretal.2005,
  author   = {Beerenwinkel, Niko and Sing, Tobias and Lengauer, Thomas and Rahnenfuhrer, Joerg and Roomp, Kirsten and Savenkov, Igor and Fischer, Roman and Hoffmann, Daniel and Selbig, Joachim and Korn, Klaus and Walter, Hauke and Berg, Thomas and Braun, Patrick and Faetkenheuer, Gerd and Oette, Mark and Rockstroh, Juergen and Kupfer, Bernd and Kaiser, Rolf and Daeumer, Martin},
  title    = {Computational methods for the design of effective therapies against drug resistant {HIV} strains},
  year     = {2005},
  abstract = {The development of drug resistance is a major obstacle to successful treatment of HIV infection. The extraordinary replication dynamics of HIV facilitates its escape from selective pressure exerted by the human immune system and by combination drug therapy. We have developed several computational methods whose combined use can support the design of optimal antiretroviral therapies based on viral genomic data},
  language = {en}
}

% Journal name follows from the ISSN, which the first entry pairs with BMC Bioinformatics.
% NOTE(review): the URL in the abstract was repaired from a broken export
% (stray spaces, presumably a lost hyphen) - confirm the host name.
@article{HummelKeshvariWeckwerthetal.2005,
  author   = {Hummel, Jan and Keshvari, N. and Weckwerth, Wolfram and Selbig, Joachim},
  title    = {Species-specific analysis of protein sequence motifs using mutual information},
  journal  = {BMC Bioinformatics},
  issn     = {1471-2105},
  year     = {2005},
  abstract = {Background: Protein sequence motifs are by definition short fragments of conserved amino acids, often associated with a specific function. Accordingly protein sequence profiles derived from multiple sequence alignments provide an alternative description of functional motifs characterizing families of related sequences. Such profiles conveniently reflect functional necessities by pointing out proximity at conserved sequence positions as well as depicting distances at variable positions. Discovering significant conservation characteristics within the variable positions of profiles mirrors group-specific and, in particular, evolutionary features of the underlying sequences. Results: We describe the tool PROfile analysis based on Mutual Information (PROMI) that enables comparative analysis of user-classified protein sequences. PROMI is implemented as a web service using Perl and R as well as other publicly available packages and tools on the server-side. On the client-side platform-independence is achieved by generally applied internet delivery standards. As one possible application analysis of the zinc finger C2H2-type protein domain is introduced to illustrate the functionality of the tool. Conclusion: The web service PROMI should assist researchers to detect evolutionary correlations in protein profiles of defined biological sequences. It is available at http://promi.mpimp-golm.mpg.de where additional documentation can be found},
  language = {en}
}

% NOTE(review): journal name derived from the ISSN - confirm; volume and pages still missing.
@article{ScholzKaplanGuyetal.2005,
  author   = {Scholz, Matthias and Kaplan, F. and Guy, C. L. and Kopka, Joachim and Selbig, Joachim},
  title    = {Non-linear {PCA}: a missing data approach},
  journal  = {Bioinformatics},
  issn     = {1367-4803},
  year     = {2005},
  abstract = {Motivation: Visualizing and analysing the potential non-linear structure of a dataset is becoming an important task in molecular biology. This is even more challenging when the data have missing values. Results: Here, we propose an inverse model that performs non-linear principal component analysis (NLPCA) from incomplete datasets. Missing values are ignored while optimizing the model, but can be estimated afterwards. Results are shown for both artificial and experimental datasets. In contrast to linear methods, non-linear methods were able to give better missing value estimations for non-linear structured data. Application: We applied this technique to a time course of metabolite data from a cold stress experiment on the model plant Arabidopsis thaliana, and could approximate the mapping function from any time point to the metabolite responses. Thus, the inverse NLPCA provides greatly improved information for better understanding the complex response to cold stress},
  language = {en}
}

% NOTE(review): journal name derived from the ISSN - confirm; volume and pages still missing.
@article{CordesKaiserSelbig2006,
  author   = {Cordes, Frank and Kaiser, Rolf and Selbig, Joachim},
  title    = {Bioinformatics approach to predicting {HIV} drug resistance},
  journal  = {Expert Review of Molecular Diagnostics},
  issn     = {1473-7159},
  doi      = {10.1586/14737159.6.2.207},
  year     = {2006},
  abstract = {The emergence of drug resistance remains one of the most challenging issues in the treatment of HIV-1 infection. The extreme replication dynamics of HIV facilitates its escape from the selective pressure exerted by the human immune system and by the applied combination drug therapy. This article reviews computational methods whose combined use can support the design of optimal antiretroviral therapies based on viral genotypic and phenotypic data. Genotypic assays are based on the analysis of mutations associated with reduced drug susceptibility, but are difficult to interpret due to the numerous mutations and mutational patterns that confer drug resistance. Phenotypic resistance or susceptibility can be experimentally evaluated by measuring the inhibition of the viral replication in cell culture assays. However, this procedure is expensive and time consuming},
  language = {en}
}

% The accent on the given name uses the LaTeX acute macro inside a brace group.
% NOTE(review): an ISBN on an article entry suggests a book chapter or
% proceedings paper; no booktitle is available here - confirm entry type.
@article{FloeterSelbigSchaub2004,
  author   = {Fl{\"o}ter, Andr{\'e} and Selbig, Joachim and Schaub, Torsten H.},
  title    = {Finding metabolic pathways in decision forests},
  isbn     = {3-540-23221-4},
  year     = {2004},
  language = {en}
}

% NOTE(review): journal is missing and cannot be derived from this record - TODO confirm.
@article{FloeterNicolasSchaubetal.2004,
  author   = {Fl{\"o}ter, Andr{\'e} and Nicolas, Jacques and Schaub, Torsten H. and Selbig, Joachim},
  title    = {Threshold extraction in metabolite concentration data},
  year     = {2004},
  abstract = {Motivation: Continued development of analytical techniques based on gas chromatography and mass spectrometry now facilitates the generation of larger sets of metabolite concentration data. An important step towards the understanding of metabolite dynamics is the recognition of stable states where metabolite concentrations exhibit a simple behaviour. Such states can be characterized through the identification of significant thresholds in the concentrations. But general techniques for finding discretization thresholds in continuous data prove to be practically insufficient for detecting states due to the weak conditional dependences in concentration data. Results: We introduce a method of recognizing states in the framework of decision tree induction. It is based upon a global analysis of decision forests where stability and quality are evaluated. It leads to the detection of thresholds that are both comprehensible and robust. Applied to metabolite concentration data, this method has led to the discovery of hidden states in the corresponding variables. Some of these reflect known properties of the biological experiments, and others point to putative new states},
  language = {en}
}

% Journal name follows from the ISSN, which the first entry pairs with BMC Bioinformatics.
@article{DaubSteuerSelbigetal.2004,
  author   = {Daub, Carsten O. and Steuer, Ralf and Selbig, Joachim and Kloska, Sebastian},
  title    = {Estimating mutual information using {B-spline} functions: an improved similarity measure for analysing gene expression data},
  journal  = {BMC Bioinformatics},
  issn     = {1471-2105},
  year     = {2004},
  abstract = {Background: The information theoretic concept of mutual information provides a general framework to evaluate dependencies between variables. In the context of the clustering of genes with similar patterns of expression it has been suggested as a general quantity of similarity to extend commonly used linear measures. Since mutual information is defined in terms of discrete variables, its application to continuous data requires the use of binning procedures, which can lead to significant numerical errors for datasets of small or moderate size. Results: In this work, we propose a method for the numerical estimation of mutual information from continuous data. We investigate the characteristic properties arising from the application of our algorithm and show that our approach outperforms commonly used algorithms: The significance, as a measure of the power of distinction from random correlation, is significantly increased. This concept is subsequently illustrated on two large-scale gene expression datasets and the results are compared to those obtained using other similarity measures. A C++ source code of our algorithm is available for non-commercial use from kloska@scienion.de upon request. Conclusion: The utilisation of mutual information as similarity measure enables the detection of non-linear correlations in gene expression datasets. Frequently applied linear correlation measures, which are often used on an ad-hoc basis without further justification, are thereby extended},
  language = {en}
}