% PhD thesis record (school and year as given in the fields below).
% NOTE(review): `pages` appears to hold front-matter plus page count
% ("ix, 197") rather than a page range -- confirm this is intended.
@phdthesis{Perscheid2023,
  author   = {Perscheid, Cindy},
  title    = {Integrative biomarker detection using prior knowledge on gene expression data sets},
  school   = {Universit{\"a}t Potsdam},
  year     = {2023},
  pages    = {ix, 197},
  language = {en},
  doi      = {10.25932/publishup-58241},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-582418},
  abstract = {Gene expression data is analyzed to identify biomarkers, e.g. relevant genes, which serve for diagnostic, predictive, or prognostic use. Traditional approaches for biomarker detection select distinctive features from the data based exclusively on the signals therein, facing multiple shortcomings in regards to overfitting, biomarker robustness, and actual biological relevance. Prior knowledge approaches are expected to address these issues by incorporating prior biological knowledge, e.g. on gene-disease associations, into the actual analysis. However, prior knowledge approaches are currently not widely applied in practice because they are often use-case specific and seldom applicable in a different scope. This leads to a lack of comparability of prior knowledge approaches, which in turn makes it currently impossible to assess their effectiveness in a broader context. Our work addresses the aforementioned issues with three contributions. Our first contribution provides formal definitions for both prior knowledge and the flexible integration thereof into the feature selection process. Central to these concepts is the automatic retrieval of prior knowledge from online knowledge bases, which allows for streamlining the retrieval process and agreeing on a uniform definition for prior knowledge. We subsequently describe novel and generalized prior knowledge approaches that are flexible regarding the used prior knowledge and applicable to varying use case domains. Our second contribution is the benchmarking platform Comprior. Comprior applies the aforementioned concepts in practice and allows for flexibly setting up comprehensive benchmarking studies for examining the performance of existing and novel prior knowledge approaches. It streamlines the retrieval of prior knowledge and allows for combining it with prior knowledge approaches. 
Comprior demonstrates the practical applicability of our concepts and further fosters the overall development and comparability of prior knowledge approaches. Our third contribution is a comprehensive case study on the effectiveness of prior knowledge approaches. For that, we used Comprior and tested a broad range of both traditional and prior knowledge approaches in combination with multiple knowledge bases on data sets from multiple disease domains. Ultimately, our case study constitutes a thorough assessment of a) the suitability of selected knowledge bases for integration, b) the impact of prior knowledge being applied at different integration levels, and c) the improvements in terms of classification performance, biological relevance, and overall robustness. In summary, our contributions demonstrate that generalized concepts for prior knowledge and a streamlined retrieval process improve the applicability of prior knowledge approaches. Results from our case study show that the integration of prior knowledge positively affects biomarker results, particularly regarding their robustness. Our findings provide the first in-depth insights on the effectiveness of prior knowledge approaches and build a valuable foundation for future research.}
}
% Journal article record.
% The former `series` field only repeated the journal name and was dropped;
% the journal title belongs in `journal` alone.
% NOTE(review): `pages = {9}` looks like a page count, not a page range or
% article number -- confirm against the publisher record.
@article{CopeBaukmannKlingeretal.2021,
  author    = {Cope, Justin L. and Baukmann, Hannes A. and Klinger, J{\"o}rn E. and Ravarani, Charles N. J. and B{\"o}ttinger, Erwin and Konigorski, Stefan and Schmidt, Marco F.},
  title     = {Interaction-based feature selection algorithm outperforms polygenic risk score in predicting {Parkinson's} Disease status},
  journal   = {Frontiers in genetics},
  volume    = {12},
  publisher = {Frontiers Media},
  address   = {Lausanne},
  issn      = {1664-8021},
  doi       = {10.3389/fgene.2021.744557},
  pages     = {9},
  year      = {2021},
  abstract  = {Polygenic risk scores (PRS) aggregating results from genome-wide association studies are the state of the art in the prediction of susceptibility to complex traits or diseases, yet their predictive performance is limited for various reasons, not least of which is their failure to incorporate the effects of gene-gene interactions. Novel machine learning algorithms that use large amounts of data promise to find gene-gene interactions in order to build models with better predictive performance than PRS. Here, we present a data preprocessing step by using data-mining of contextual information to reduce the number of features, enabling machine learning algorithms to identify gene-gene interactions. We applied our approach to the Parkinson's Progression Markers Initiative (PPMI) dataset, an observational clinical study of 471 genotyped subjects (368 cases and 152 controls). With an AUC of 0.85 (95\% CI = [0.72; 0.96]), the interaction-based prediction model outperforms the PRS (AUC of 0.58 (95\% CI = [0.42; 0.81])). Furthermore, feature importance analysis of the model provided insights into the mechanism of Parkinson's disease. For instance, the model revealed an interaction of previously described drug target candidate genes TMEM175 and GAPDHP25. These results demonstrate that interaction-based machine learning models can improve genetic prediction models and might provide an answer to the missing heritability problem.},
  language  = {en}
}
% Journal article record.
% The last author's name is written with proper LaTeX escapes ({\'o}, {\l});
% the previous form used a non-ASCII acute-accent character after the
% backslash, which is not a valid accent command, plus a raw non-ASCII letter.
% The former `series` field only repeated the journal name and was dropped.
% NOTE(review): `pages = {17}` looks like a page count, not a page range or
% article number -- confirm against the publisher record.
@article{ErlerRiebeBeitzetal.2023,
  author    = {Erler, Alexander and Riebe, Daniel and Beitz, Toralf and L{\"o}hmannsr{\"o}ben, Hans-Gerd and Leenen, Mathias and P{\"a}tzold, Stefan and Ostermann, Markus and W{\'o}jcik, Micha{\l}},
  title     = {Mobile laser-induced breakdown spectroscopy for future application in precision agriculture},
  journal   = {Sensors},
  volume    = {23},
  number    = {16},
  publisher = {MDPI},
  address   = {Basel},
  issn      = {1424-8220},
  doi       = {10.3390/s23167178},
  pages     = {17},
  year      = {2023},
  abstract  = {In precision agriculture, the estimation of soil parameters via sensors and the creation of nutrient maps are a prerequisite for farmers to take targeted measures such as spatially resolved fertilization. In this work, 68 soil samples uniformly distributed over a field near Bonn are investigated using laser-induced breakdown spectroscopy (LIBS). These investigations include the determination of the total contents of macro- and micronutrients as well as further soil parameters such as soil pH, soil organic matter (SOM) content, and soil texture. The applied LIBS instruments are a handheld and a platform spectrometer, which potentially allows for the single-point measurement and scanning of whole fields, respectively. Their results are compared with a high-resolution lab spectrometer. The prediction of soil parameters was based on multivariate methods. Different feature selection methods and regression methods like PLS, PCR, SVM, Lasso, and Gaussian processes were tested and compared. While good predictions were obtained for Ca, Mg, P, Mn, Cu, and silt content, excellent predictions were obtained for K, Fe, and clay content. The comparison of the three different spectrometers showed that although the lab spectrometer gives the best results, measurements with both field spectrometers also yield good results. This allows for a method transfer to the in-field measurements.},
  language  = {en}
}
% Record of an item in a numbered university publication series (no. 1324).
% The series title is kept in `series` only; it is not a journal, so the
% duplicate `journal` field was dropped.
% Author "Meyer" is given with her full first names, matching the spelling
% used for the same person elsewhere in this file.
% NOTE(review): `pages = {9}` looks like a page count rather than a page
% range -- confirm.
@misc{SteinfathGaertnerLisecetal.2009,
  author    = {Steinfath, Matthias and G{\"a}rtner, Tanja and Lisec, Jan and Meyer, Rhonda Christiane and Altmann, Thomas and Willmitzer, Lothar and Selbig, Joachim},
  title     = {Prediction of hybrid biomass in {Arabidopsis} thaliana by selected parental {SNP} and metabolic markers},
  series    = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Mathematisch-Naturwissenschaftliche Reihe},
  number    = {1324},
  issn      = {1866-8372},
  doi       = {10.25932/publishup-43111},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-431115},
  pages     = {9},
  year      = {2009},
  abstract  = {A recombinant inbred line (RIL) population, derived from two Arabidopsis thaliana accessions, and the corresponding testcrosses with these two original accessions were used for the development and validation of machine learning models to predict the biomass of hybrids. Genetic and metabolic information of the RILs served as predictors. Feature selection reduced the number of variables (genetic and metabolic markers) in the models by more than 80\% without impairing the predictive power. Thus, potential biomarkers have been revealed. Metabolites were shown to bear information on inherited macroscopic phenotypes. This proof of concept could be interesting for breeders. The example population exhibits substantial mid-parent biomass heterosis. The results of feature selection could therefore be used to shed light on the origin of heterosis. In this respect, mainly dominance effects were detected.},
  language  = {en}
}
% Journal article record (Theoretical and applied genetics, vol. 120).
% The key carries an "a" suffix because the unsuffixed key is already used by
% an earlier record in this file; citation keys must be unique, and a
% repeated key makes the later record unreachable.
% The former `series` field only repeated the journal name and was dropped;
% the page range uses the plain "--" form without surrounding spaces.
@article{SteinfathGaertnerLisecetal.2009a,
  author    = {Steinfath, Matthias and G{\"a}rtner, Tanja and Lisec, Jan and Meyer, Rhonda Christiane and Altmann, Thomas and Willmitzer, Lothar and Selbig, Joachim},
  title     = {Prediction of hybrid biomass in {Arabidopsis} thaliana by selected parental {SNP} and metabolic markers},
  journal   = {Theoretical and applied genetics : TAG ; international journal of plant breeding research},
  volume    = {120},
  publisher = {Springer},
  address   = {Berlin},
  issn      = {0040-5752},
  doi       = {10.1007/s00122-009-1191-2},
  pages     = {239--247},
  year      = {2009},
  abstract  = {A recombinant inbred line (RIL) population, derived from two Arabidopsis thaliana accessions, and the corresponding testcrosses with these two original accessions were used for the development and validation of machine learning models to predict the biomass of hybrids. Genetic and metabolic information of the RILs served as predictors. Feature selection reduced the number of variables (genetic and metabolic markers) in the models by more than 80\% without impairing the predictive power. Thus, potential biomarkers have been revealed. Metabolites were shown to bear information on inherited macroscopic phenotypes. This proof of concept could be interesting for breeders. The example population exhibits substantial mid-parent biomass heterosis. The results of feature selection could therefore be used to shed light on the origin of heterosis. In this respect, mainly dominance effects were detected.},
  language  = {en}
}