@article{RyngajlloChildsLohseetal.2011, author = {Ryngajllo, Malgorzata and Childs, Liam H. and Lohse, Marc and Giorgi, Federico M. and Lude, Anja and Selbig, Joachim and Usadel, Bj{\"o}rn}, title = {SLocX predicting subcellular localization of Arabidopsis proteins leveraging gene expression data}, series = {Frontiers in plant science}, volume = {2}, journal = {Frontiers in plant science}, publisher = {Frontiers Research Foundation}, address = {Lausanne}, issn = {1664-462X}, doi = {10.3389/fpls.2011.00043}, pages = {19}, year = {2011}, abstract = {Despite the growing volume of experimentally validated knowledge about the subcellular localization of plant proteins, a well performing in silico prediction tool is still a necessity. Existing tools, which employ information derived from protein sequence alone, offer limited accuracy and/or rely on full sequence availability. We explored whether gene expression profiling data can be harnessed to enhance prediction performance. To achieve this, we trained several support vector machines to predict the subcellular localization of Arabidopsis thaliana proteins using sequence derived information, expression behavior, or a combination of these data and compared their predictive performance through a cross-validation test. We show that gene expression carries information about the subcellular localization not available in sequence information, yielding dramatic benefits for plastid localization prediction, and some notable improvements for other compartments such as the mito-chondrion, the Golgi, and the plasma membrane. Based on these results, we constructed a novel subcellular localization prediction engine, SLocX, combining gene expression profiling data with protein sequence-based information. We then validated the results of this engine using an independent test set of annotated proteins and a transient expression of GFP fusion proteins. Here, we present the prediction framework and a website of predicted localizations for Arabidopsis. The relatively good accuracy of our prediction engine, even in cases where only partial protein sequence is available (e.g., in sequences lacking the N-terminal region), offers a promising opportunity for similar application to non-sequenced or poorly annotated plant species. Although the prediction scope of our method is currently limited by the availability of expression information on the ATH1 array, we believe that the advances in measuring gene expression technology will make our method applicable for all Arabidopsis proteins.}, language = {en} } @article{KlieNikoloskiSelbig2014, author = {Klie, Sebastian and Nikoloski, Zoran and Selbig, Joachim}, title = {Biological cluster evaluation for gene function prediction}, series = {Journal of computational biology}, volume = {21}, journal = {Journal of computational biology}, number = {6}, publisher = {Liebert}, address = {New Rochelle}, issn = {1066-5277}, doi = {10.1089/cmb.2009.0129}, pages = {428 -- 445}, year = {2014}, abstract = {Recent advances in high-throughput omics techniques render it possible to decode the function of genes by using the "guilt-by-association" principle on biologically meaningful clusters of gene expression data. However, the existing frameworks for biological evaluation of gene clusters are hindered by two bottleneck issues: (1) the choice for the number of clusters, and (2) the external measures which do not take in consideration the structure of the analyzed data and the ontology of the existing biological knowledge. Here, we address the identified bottlenecks by developing a novel framework that allows not only for biological evaluation of gene expression clusters based on existing structured knowledge, but also for prediction of putative gene functions. The proposed framework facilitates propagation of statistical significance at each of the following steps: (1) estimating the number of clusters, (2) evaluating the clusters in terms of novel external structural measures, (3) selecting an optimal clustering algorithm, and (4) predicting gene functions. The framework also includes a method for evaluation of gene clusters based on the structure of the employed ontology. Moreover, our method for obtaining a probabilistic range for the number of clusters is demonstrated valid on synthetic data and available gene expression profiles from Saccharomyces cerevisiae. Finally, we propose a network-based approach for gene function prediction which relies on the clustering of optimal score and the employed ontology. Our approach effectively predicts gene function on the Saccharomyces cerevisiae data set and is also employed to obtain putative gene functions for an Arabidopsis thaliana data set.}, language = {en} }