@misc{PerscheidFaberKrausetal.2018, author = {Perscheid, Cindy and Faber, Lukas and Kraus, Milena and Arndt, Paul and Janke, Michael and Rehfeldt, Sebastian and Schubotz, Antje and Slosarek, Tamara and Uflacker, Matthias}, title = {A tissue-aware gene selection approach for analyzing multi-tissue gene expression data}, series = {2018 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)}, journal = {2018 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)}, publisher = {IEEE}, address = {New York}, isbn = {978-1-5386-5488-0}, issn = {2156-1125}, doi = {10.1109/BIBM.2018.8621189}, pages = {2159 -- 2166}, year = {2018}, abstract = {High-throughput RNA sequencing (RNAseq) produces large data sets containing expression levels of thousands of genes. The analysis of RNAseq data leads to a better understanding of gene functions and interactions, which eventually helps to study diseases like cancer and develop effective treatments. Large-scale RNAseq expression studies on cancer comprise samples from multiple cancer types and aim to identify their distinct molecular characteristics. Analyzing samples from different cancer types implies analyzing samples from different tissue origin. Such multi-tissue RNAseq data sets require a meaningful analysis that accounts for the inherent tissue-related bias: The identified characteristics must not originate from the differences in tissue types, but from the actual differences in cancer types. However, current analysis procedures do not incorporate that aspect. As a result, we propose to integrate a tissue-awareness into the analysis of multi-tissue RNAseq data. We introduce an extension for gene selection that provides a tissue-wise context for every gene and can be flexibly combined with any existing gene selection approach. We suggest to expand conventional evaluation by additional metrics that are sensitive to the tissue-related bias. Evaluations show that especially low complexity gene selection approaches profit from introducing tissue-awareness.}, language = {en} } @article{Perscheid2021, author = {Perscheid, Cindy}, title = {Comprior}, series = {BMC Bioinformatics}, volume = {22}, journal = {BMC Bioinformatics}, publisher = {Springer Nature}, address = {London}, issn = {1471-2105}, doi = {10.1186/s12859-021-04308-z}, pages = {1 -- 15}, year = {2021}, abstract = {Background Reproducible benchmarking is important for assessing the effectiveness of novel feature selection approaches applied on gene expression data, especially for prior knowledge approaches that incorporate biological information from online knowledge bases. However, no full-fledged benchmarking system exists that is extensible, provides built-in feature selection approaches, and a comprehensive result assessment encompassing classification performance, robustness, and biological relevance. Moreover, the particular needs of prior knowledge feature selection approaches, i.e. uniform access to knowledge bases, are not addressed. As a consequence, prior knowledge approaches are not evaluated amongst each other, leaving open questions regarding their effectiveness. Results We present the Comprior benchmark tool, which facilitates the rapid development and effortless benchmarking of feature selection approaches, with a special focus on prior knowledge approaches. Comprior is extensible by custom approaches, offers built-in standard feature selection approaches, enables uniform access to multiple knowledge bases, and provides a customizable evaluation infrastructure to compare multiple feature selection approaches regarding their classification performance, robustness, runtime, and biological relevance. Conclusion Comprior allows reproducible benchmarking especially of prior knowledge approaches, which facilitates their applicability and for the first time enables a comprehensive assessment of their effectiveness}, language = {en} } @misc{Perscheid2021, author = {Perscheid, Cindy}, title = {Comprior: facilitating the implementation and automated benchmarking of prior knowledge-based feature selection approaches on gene expression data sets}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-54894}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-548943}, pages = {1 -- 15}, year = {2021}, abstract = {Background Reproducible benchmarking is important for assessing the effectiveness of novel feature selection approaches applied on gene expression data, especially for prior knowledge approaches that incorporate biological information from online knowledge bases. However, no full-fledged benchmarking system exists that is extensible, provides built-in feature selection approaches, and a comprehensive result assessment encompassing classification performance, robustness, and biological relevance. Moreover, the particular needs of prior knowledge feature selection approaches, i.e. uniform access to knowledge bases, are not addressed. As a consequence, prior knowledge approaches are not evaluated amongst each other, leaving open questions regarding their effectiveness. Results We present the Comprior benchmark tool, which facilitates the rapid development and effortless benchmarking of feature selection approaches, with a special focus on prior knowledge approaches. Comprior is extensible by custom approaches, offers built-in standard feature selection approaches, enables uniform access to multiple knowledge bases, and provides a customizable evaluation infrastructure to compare multiple feature selection approaches regarding their classification performance, robustness, runtime, and biological relevance. Conclusion Comprior allows reproducible benchmarking especially of prior knowledge approaches, which facilitates their applicability and for the first time enables a comprehensive assessment of their effectiveness}, language = {en} } @inproceedings{KurbelNowakAzodietal.2015, author = {Kurbel, Karl and Nowak, Dawid and Azodi, Amir and Jaeger, David and Meinel, Christoph and Cheng, Feng and Sapegin, Andrey and Gawron, Marian and Morelli, Frank and Stahl, Lukas and Kerl, Stefan and Janz, Mariska and Hadaya, Abdulmasih and Ivanov, Ivaylo and Wiese, Lena and Neves, Mariana and Schapranow, Matthieu-Patrick and F{\"a}hnrich, Cindy and Feinbube, Frank and Eberhardt, Felix and Hagen, Wieland and Plauth, Max and Herscheid, Lena and Polze, Andreas and Barkowsky, Matthias and Dinger, Henriette and Faber, Lukas and Montenegro, Felix and Czach{\´o}rski, Tadeusz and Nycz, Monika and Nycz, Tomasz and Baader, Galina and Besner, Veronika and Hecht, Sonja and Schermann, Michael and Krcmar, Helmut and Wiradarma, Timur Pratama and Hentschel, Christian and Sack, Harald and Abramowicz, Witold and Sokolowska, Wioletta and Hossa, Tymoteusz and Opalka, Jakub and Fabisz, Karol and Kubaczyk, Mateusz and Cmil, Milena and Meng, Tianhui and Dadashnia, Sharam and Niesen, Tim and Fettke, Peter and Loos, Peter and Perscheid, Cindy and Schwarz, Christian and Schmidt, Christopher and Scholz, Matthias and Bock, Nikolai and Piller, Gunther and B{\"o}hm, Klaus and Norkus, Oliver and Clark, Brian and Friedrich, Bj{\"o}rn and Izadpanah, Babak and Merkel, Florian and Schweer, Ilias and Zimak, Alexander and Sauer, J{\"u}rgen and Fabian, Benjamin and Tilch, Georg and M{\"u}ller, David and Pl{\"o}ger, Sabrina and Friedrich, Christoph M. and Engels, Christoph and Amirkhanyan, Aragats and van der Walt, Est{\´e}e and Eloff, J. H. P. and Scheuermann, Bernd and Weinknecht, Elisa}, title = {HPI Future SOC Lab}, editor = {Meinel, Christoph and Polze, Andreas and Oswald, Gerhard and Strotmann, Rolf and Seibold, Ulrich and Schulzki, Bernhard}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-102516}, pages = {iii, 154}, year = {2015}, abstract = {Das Future SOC Lab am HPI ist eine Kooperation des Hasso-Plattner-Instituts mit verschiedenen Industriepartnern. Seine Aufgabe ist die Erm{\"o}glichung und F{\"o}rderung des Austausches zwischen Forschungsgemeinschaft und Industrie. Am Lab wird interessierten Wissenschaftlern eine Infrastruktur von neuester Hard- und Software kostenfrei f{\"u}r Forschungszwecke zur Verf{\"u}gung gestellt. Dazu z{\"a}hlen teilweise noch nicht am Markt verf{\"u}gbare Technologien, die im normalen Hochschulbereich in der Regel nicht zu finanzieren w{\"a}ren, bspw. Server mit bis zu 64 Cores und 2 TB Hauptspeicher. Diese Angebote richten sich insbesondere an Wissenschaftler in den Gebieten Informatik und Wirtschaftsinformatik. Einige der Schwerpunkte sind Cloud Computing, Parallelisierung und In-Memory Technologien. In diesem Technischen Bericht werden die Ergebnisse der Forschungsprojekte des Jahres 2015 vorgestellt. Ausgew{\"a}hlte Projekte stellten ihre Ergebnisse am 15. April 2015 und 4. November 2015 im Rahmen der Future SOC Lab Tag Veranstaltungen vor.}, language = {en} } @misc{PerscheidUflacker2019, author = {Perscheid, Cindy and Uflacker, Matthias}, title = {Integrating Biological Context into the Analysis of Gene Expression Data}, series = {Distributed Computing and Artificial Intelligence, Special Sessions, 15th International Conference}, volume = {801}, journal = {Distributed Computing and Artificial Intelligence, Special Sessions, 15th International Conference}, publisher = {Springer}, address = {Cham}, isbn = {978-3-319-99608-0}, issn = {2194-5357}, doi = {10.1007/978-3-319-99608-0_41}, pages = {339 -- 343}, year = {2019}, abstract = {High-throughput RNA sequencing produces large gene expression datasets whose analysis leads to a better understanding of diseases like cancer. The nature of RNA-Seq data poses challenges to its analysis in terms of its high dimensionality, noise, and complexity of the underlying biological processes. Researchers apply traditional machine learning approaches, e. g. hierarchical clustering, to analyze this data. Until it comes to validation of the results, the analysis is based on the provided data only and completely misses the biological context. However, gene expression data follows particular patterns - the underlying biological processes. In our research, we aim to integrate the available biological knowledge earlier in the analysis process. We want to adapt state-of-the-art data mining algorithms to consider the biological context in their computations and deliver meaningful results for researchers.}, language = {en} } @article{Perscheid2021, author = {Perscheid, Cindy}, title = {Integrative biomarker detection on high-dimensional gene expression data sets}, series = {Briefings in bioinformatics}, volume = {22}, journal = {Briefings in bioinformatics}, number = {3}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1467-5463}, doi = {10.1093/bib/bbaa151}, pages = {18}, year = {2021}, abstract = {Gene expression data provide the expression levels of tens of thousands of genes from several hundred samples. These data are analyzed to detect biomarkers that can be of prognostic or diagnostic use. Traditionally, biomarker detection for gene expression data is the task of gene selection. The vast number of genes is reduced to a few relevant ones that achieve the best performance for the respective use case. Traditional approaches select genes based on their statistical significance in the data set. This results in issues of robustness, redundancy and true biological relevance of the selected genes. Integrative analyses typically address these shortcomings by integrating multiple data artifacts from the same objects, e.g. gene expression and methylation data. When only gene expression data are available, integrative analyses instead use curated information on biological processes from public knowledge bases. With knowledge bases providing an ever-increasing amount of curated biological knowledge, such prior knowledge approaches become more powerful. This paper provides a thorough overview on the status quo of biomarker detection on gene expression data with prior biological knowledge. We discuss current shortcomings of traditional approaches, review recent external knowledge bases, provide a classification and qualitative comparison of existing prior knowledge approaches and discuss open challenges for this kind of gene selection.}, language = {en} } @phdthesis{Perscheid2023, author = {Perscheid, Cindy}, title = {Integrative biomarker detection using prior knowledge on gene expression data sets}, doi = {10.25932/publishup-58241}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-582418}, school = {Universit{\"a}t Potsdam}, pages = {ix, 197}, year = {2023}, abstract = {Gene expression data is analyzed to identify biomarkers, e.g. relevant genes, which serve for diagnostic, predictive, or prognostic use. Traditional approaches for biomarker detection select distinctive features from the data based exclusively on the signals therein, facing multiple shortcomings in regards to overfitting, biomarker robustness, and actual biological relevance. Prior knowledge approaches are expected to address these issues by incorporating prior biological knowledge, e.g. on gene-disease associations, into the actual analysis. However, prior knowledge approaches are currently not widely applied in practice because they are often use-case specific and seldom applicable in a different scope. This leads to a lack of comparability of prior knowledge approaches, which in turn makes it currently impossible to assess their effectiveness in a broader context. Our work addresses the aforementioned issues with three contributions. Our first contribution provides formal definitions for both prior knowledge and the flexible integration thereof into the feature selection process. Central to these concepts is the automatic retrieval of prior knowledge from online knowledge bases, which allows for streamlining the retrieval process and agreeing on a uniform definition for prior knowledge. We subsequently describe novel and generalized prior knowledge approaches that are flexible regarding the used prior knowledge and applicable to varying use case domains. Our second contribution is the benchmarking platform Comprior. Comprior applies the aforementioned concepts in practice and allows for flexibly setting up comprehensive benchmarking studies for examining the performance of existing and novel prior knowledge approaches. It streamlines the retrieval of prior knowledge and allows for combining it with prior knowledge approaches. Comprior demonstrates the practical applicability of our concepts and further fosters the overall development and comparability of prior knowledge approaches. Our third contribution is a comprehensive case study on the effectiveness of prior knowledge approaches. For that, we used Comprior and tested a broad range of both traditional and prior knowledge approaches in combination with multiple knowledge bases on data sets from multiple disease domains. Ultimately, our case study constitutes a thorough assessment of a) the suitability of selected knowledge bases for integration, b) the impact of prior knowledge being applied at different integration levels, and c) the improvements in terms of classification performance, biological relevance, and overall robustness. In summary, our contributions demonstrate that generalized concepts for prior knowledge and a streamlined retrieval process improve the applicability of prior knowledge approaches. Results from our case study show that the integration of prior knowledge positively affects biomarker results, particularly regarding their robustness. Our findings provide the first in-depth insights on the effectiveness of prior knowledge approaches and build a valuable foundation for future research.}, language = {en} } @article{PerscheidGrasnickUflacker2019, author = {Perscheid, Cindy and Grasnick, Bastien and Uflacker, Matthias}, title = {Integrative Gene Selection on Gene Expression Data}, series = {Journal of Integrative Bioinformatics}, volume = {16}, journal = {Journal of Integrative Bioinformatics}, number = {1}, publisher = {De Gruyter}, address = {Berlin}, issn = {1613-4516}, doi = {10.1515/jib-2018-0064}, pages = {17}, year = {2019}, abstract = {The advance of high-throughput RNA-Sequencing techniques enables researchers to analyze the complete gene activity in particular cells. From the insights of such analyses, researchers can identify disease-specific expression profiles, thus understand complex diseases like cancer, and eventually develop effective measures for diagnosis and treatment. The high dimensionality of gene expression data poses challenges to its computational analysis, which is addressed with measures of gene selection. Traditional gene selection approaches base their findings on statistical analyses of the actual expression levels, which implies several drawbacks when it comes to accurately identifying the underlying biological processes. In turn, integrative approaches include curated information on biological processes from external knowledge bases during gene selection, which promises to lead to better interpretability and improved predictive performance. Our work compares the performance of traditional and integrative gene selection approaches. Moreover, we propose a straightforward approach to integrate external knowledge with traditional gene selection approaches. We introduce a framework enabling the automatic external knowledge integration, gene selection, and evaluation. Evaluation results prove our framework to be a useful tool for evaluation and show that integration of external knowledge improves overall analysis results.}, language = {en} }