@article{AdnanSrsicVenticichetal.2020, author = {Adnan, Hassan Sami and Srsic, Amanda and Venticich, Pete Milos and Townend, David M.R.}, title = {Using AI for mental health analysis and prediction in school surveys}, series = {European journal of public health}, volume = {30}, journal = {European journal of public health}, publisher = {Oxford Univ. Press}, address = {Oxford [u.a.]}, issn = {1101-1262}, doi = {10.1093/eurpub/ckaa165.336}, pages = {V125 -- V125}, year = {2020}, abstract = {Background: Childhood and adolescence are critical stages of life for mental health and well-being. Schools are a key setting for mental health promotion and illness prevention. One in five children and adolescents have a mental disorder, about half of mental disorders beginning before the age of 14. Beneficial and explainable artificial intelligence can replace current paper- based and online approaches to school mental health surveys. This can enhance data acquisition, interoperability, data driven analysis, trust and compliance. This paper presents a model for using chatbots for non-obtrusive data collection and supervised machine learning models for data analysis; and discusses ethical considerations pertaining to the use of these models. Methods: For data acquisition, the proposed model uses chatbots which interact with students. The conversation log acts as the source of raw data for the machine learning. Pre-processing of the data is automated by filtering for keywords and phrases. Existing survey results, obtained through current paper-based data collection methods, are evaluated by domain experts (health professionals). These can be used to create a test dataset to validate the machine learning models. Supervised learning can then be deployed to classify specific behaviour and mental health patterns. Results: We present a model that can be used to improve upon current paper-based data collection and manual data analysis methods. 
An open-source GitHub repository contains necessary tools and components of this model. Privacy is respected through rigorous observance of confidentiality and data protection requirements. Critical reflection on these ethics and law aspects is included in the project. Conclusions: This model strengthens mental health surveillance in schools. The same tools and components could be applied to other public health data. Future extensions of this model could also incorporate unsupervised learning to find clusters and patterns of unknown effects.}, language = {en} } @article{AndresBruttelFriedrichsen2022, author = {Andres, Maximilian and Bruttel, Lisa and Friedrichsen, Jana}, title = {How communication makes the difference between a cartel and tacit collusion}, series = {European economic review}, volume = {152}, journal = {European economic review}, publisher = {Elsevier}, address = {Amsterdam}, issn = {0014-2921}, doi = {10.1016/j.euroecorev.2022.104331}, pages = {1 -- 18}, year = {2022}, abstract = {This paper sheds new light on the role of communication for cartel formation. Using machine learning to evaluate free-form chat communication among firms in a laboratory experiment, we identify typical communication patterns for both explicit cartel formation and indirect attempts to collude tacitly. We document that firms are less likely to communicate explicitly about price fixing and more likely to use indirect messages when sanctioning institutions are present. This effect of sanctions on communication reinforces the direct cartel-deterring effect of sanctions as collusion is more difficult to reach and sustain without an explicit agreement. Indirect messages have no, or even a negative, effect on prices.}, language = {en} } @article{Ayzel2021, author = {Ayzel, Georgy}, title = {Deep neural networks in hydrology}, series = {Vestnik of Saint Petersburg University. Earth Sciences}, volume = {66}, journal = {Vestnik of Saint Petersburg University. 
Earth Sciences}, number = {1}, publisher = {Univ. Press}, address = {St. Petersburg}, issn = {2541-9668}, doi = {10.21638/spbu07.2021.101}, pages = {5 -- 18}, year = {2021}, abstract = {For around a decade, deep learning - the sub-field of machine learning that refers to artificial neural networks comprised of many computational layers - modifies the landscape of statistical model development in many research areas, such as image classification, machine translation, and speech recognition. Geoscientific disciplines in general and the field of hydrology in particular, also do not stand aside from this movement. Recently, the proliferation of modern deep learning-based techniques and methods has been actively gaining popularity for solving a wide range of hydrological problems: modeling and forecasting of river runoff, hydrological model parameters regionalization, assessment of available water resources. identification of the main drivers of the recent change in water balance components. This growing popularity of deep neural networks is primarily due to their high universality and efficiency. The presented qualities, together with the rapidly growing amount of accumulated environmental information, as well as increasing availability of computing facilities and resources, allow us to speak about deep neural networks as a new generation of mathematical models designed to, if not to replace existing solutions, but significantly enrich the field of geophysical processes modeling. This paper provides a brief overview of the current state of the field of development and application of deep neural networks in hydrology. 
Also in the following study, the qualitative long-term forecast regarding the development of deep learning technology for managing the corresponding hydrological modeling challenges is provided based on the use of "Gartner Hype Curve", which in the general details describes a life cycle of modern technologies.}, language = {en} } @article{AyzelIzhitskiy2019, author = {Ayzel, Georgy and Izhitskiy, Alexander}, title = {Climate Change Impact Assessment on Freshwater Inflow into the Small Aral Sea}, series = {Water}, volume = {11}, journal = {Water}, number = {11}, publisher = {MDPI}, address = {Basel}, issn = {2073-4441}, doi = {10.3390/w11112377}, pages = {19}, year = {2019}, abstract = {During the last few decades, the rapid separation of the Small Aral Sea from the isolated basin has changed its hydrological and ecological conditions tremendously. In the present study, we developed and validated the hybrid model for the Syr Darya River basin based on a combination of state-of-the-art hydrological and machine learning models. Climate change impact on freshwater inflow into the Small Aral Sea for the projection period 2007-2099 has been quantified based on the developed hybrid model and bias corrected and downscaled meteorological projections simulated by four General Circulation Models (GCM) for each of three Representative Concentration Pathway scenarios (RCP). The developed hybrid model reliably simulates freshwater inflow for the historical period with a Nash-Sutcliffe efficiency of 0.72 and a Kling-Gupta efficiency of 0.77. Results of the climate change impact assessment showed that the freshwater inflow projections produced by different GCMs are misleading by providing contradictory results for the projection period. However, we identified that the relative runoff changes are expected to be more pronounced in the case of more aggressive RCP scenarios. 
The simulated projections of freshwater inflow provide a basis for further assessment of climate change impacts on hydrological and ecological conditions of the Small Aral Sea in the 21st Century.}, language = {en} } @article{BaumgartBoosEckstein2023, author = {Baumgart, Lene and Boos, Pauline and Eckstein, Bernd}, title = {Datafication and algorithmic contingency}, series = {Work organisation, labour \& globalisation}, volume = {17}, journal = {Work organisation, labour \& globalisation}, number = {1}, publisher = {Pluto Journals}, address = {London}, issn = {1745-641X}, doi = {10.13169/workorgalaboglob.17.1.0061}, pages = {61 -- 73}, year = {2023}, abstract = {In the context of persistent images of self-perpetuated technologies, we discuss the interplay of digital technologies and organisational dynamics against the backdrop of systems theory. Building on the case of an international corporation that, during an agile reorganisation, introduced an AI-based personnel management platform, we show how technical systems produce a form of algorithmic contingency that subsequently leads to the emergence of formal and informal interaction systems. Using the concept of datafication, we explain how these interactions are barriers to the self-perpetuation of data-based decision-making, making it possible to take into consideration further decision factors and complementing the output of the platform. The research was carried out within the scope of the research project 'Organisational Implications of Digitalisation: The Development of (Post-)Bureaucratic Organisational Structures in the Context of Digital Transformation' funded by the German Research Foundation (DFG).}, language = {en} } @article{BornhorstNustedeFudickar2019, author = {Bornhorst, Julia and Nustede, Eike Jannik and Fudickar, Sebastian}, title = {Mass Surveilance of C. 
elegans-Smartphone-Based DIY Microscope and Machine-Learning-Based Approach for Worm Detection}, series = {Sensors}, volume = {19}, journal = {Sensors}, number = {6}, publisher = {MDPI}, address = {Basel}, issn = {1424-8220}, doi = {10.3390/s19061468}, pages = {14}, year = {2019}, abstract = {The nematode Caenorhabditis elegans (C. elegans) is often used as an alternative animal model due to several advantages such as morphological changes that can be seen directly under a microscope. Limitations of the model include the usage of expensive and cumbersome microscopes, and restrictions of the comprehensive use of C. elegans for toxicological trials. With the general applicability of the detection of C. elegans from microscope images via machine learning, as well as of smartphone-based microscopes, this article investigates the suitability of smartphone-based microscopy to detect C. elegans in a complete Petri dish. Thereby, the article introduces a smartphone-based microscope (including optics, lighting, and housing) for monitoring C. elegans and the corresponding classification via a trained Histogram of Oriented Gradients (HOG) feature-based Support Vector Machine for the automatic detection of C. elegans. 
Evaluation showed classification sensitivity of 0.90 and specificity of 0.85, and thereby confirms the general practicability of the chosen approach.}, language = {en} } @article{BrandesSicksBerger2021, author = {Brandes, Stefanie and Sicks, Florian and Berger, Anne}, title = {Behaviour classification on giraffes (Giraffa camelopardalis) using machine learning algorithms on triaxial acceleration data of two commonly used GPS devices and its possible application for their management and conservation}, series = {Sensors}, volume = {21}, journal = {Sensors}, number = {6}, publisher = {MDPI}, address = {Basel}, issn = {1424-8220}, doi = {10.3390/s21062229}, pages = {22}, year = {2021}, abstract = {Averting today's loss of biodiversity and ecosystem services can be achieved through conservation efforts, especially of keystone species. Giraffes (Giraffa camelopardalis) play an important role in sustaining Africa's ecosystems, but are 'vulnerable' according to the IUCN Red List since 2016. Monitoring an animal's behavior in the wild helps to develop and assess their conservation management. One mechanism for remote tracking of wildlife behavior is to attach accelerometers to animals to record their body movement. We tested two different commercially available high-resolution accelerometers, e-obs and Africa Wildlife Tracking (AWT), attached to the top of the heads of three captive giraffes and analyzed the accuracy of automatic behavior classifications, focused on the Random Forests algorithm. For both accelerometers, behaviors of lower variety in head and neck movements could be better predicted (i.e., feeding above eye level, mean prediction accuracy e-obs/AWT: 97.6\%/99.7\%; drinking: 96.7\%/97.0\%) than those with a higher variety of body postures (such as standing: 90.7-91.0\%/75.2-76.7\%; rumination: 89.6-91.6\%/53.5-86.5\%). 
Nonetheless both devices come with limitations and especially the AWT needs technological adaptations before applying it on animals in the wild. Nevertheless, looking at the prediction results, both are promising accelerometers for behavioral classification of giraffes. Therefore, these devices when applied to free-ranging animals, in combination with GPS tracking, can contribute greatly to the conservation of giraffes.}, language = {en} } @article{CeulemansGuillGaedke2021, author = {Ceulemans, Ruben and Guill, Christian and Gaedke, Ursula}, title = {Top predators govern multitrophic diversity effects in tritrophic food webs}, series = {Ecology : a publication of the Ecological Society of America}, volume = {102}, journal = {Ecology : a publication of the Ecological Society of America}, number = {7}, publisher = {Wiley}, address = {Hoboken}, issn = {0012-9658}, doi = {10.1002/ecy.3379}, pages = {16}, year = {2021}, abstract = {It is well known that functional diversity strongly affects ecosystem functioning. However, even in rather simple model communities consisting of only two or, at best, three trophic levels, the relationship between multitrophic functional diversity and ecosystem functioning appears difficult to generalize, because of its high contextuality. In this study, we considered several differently structured tritrophic food webs, in which the amount of functional diversity was varied independently on each trophic level. To achieve generalizable results, largely independent of parametrization, we examined the outcomes of 128,000 parameter combinations sampled from ecologically plausible intervals, with each tested for 200 randomly sampled initial conditions. Analysis of our data was done by training a random forest model. This method enables the identification of complex patterns in the data through partial dependence graphs, and the comparison of the relative influence of model parameters, including the degree of diversity, on food-web properties. 
We found that bottom-up and top-down effects cascade simultaneously throughout the food web, intimately linking the effects of functional diversity of any trophic level to the amount of diversity of other trophic levels, which may explain the difficulty in unifying results from previous studies. Strikingly, only with high diversity throughout the whole food web, different interactions synergize to ensure efficient exploitation of the available nutrients and efficient biomass transfer to higher trophic levels, ultimately leading to a high biomass and production on the top level. The temporal variation of biomass showed a more complex pattern with increasing multitrophic diversity: while the system initially became less variable, eventually the temporal variation rose again because of the increasingly complex dynamical patterns. Importantly, top predator diversity and food-web parameters affecting the top trophic level were of highest importance to determine the biomass and temporal variability of any trophic level. 
Overall, our study reveals that the mechanisms by which diversity influences ecosystem functioning are affected by every part of the food web, hampering the extrapolation of insights from simple monotrophic or bitrophic systems to complex natural food webs.}, language = {en} } @article{ChenLangeAndjelkovicetal.2022, author = {Chen, Junchao and Lange, Thomas and Andjelkovic, Marko and Simevski, Aleksandar and Lu, Li and Krstić, Miloš}, title = {Solar particle event and single event upset prediction from SRAM-based monitor and supervised machine learning}, series = {IEEE transactions on emerging topics in computing / IEEE Computer Society, Institute of Electrical and Electronics Engineers}, volume = {10}, journal = {IEEE transactions on emerging topics in computing / IEEE Computer Society, Institute of Electrical and Electronics Engineers}, number = {2}, publisher = {Institute of Electrical and Electronics Engineers}, address = {[New York, NY]}, issn = {2168-6750}, doi = {10.1109/TETC.2022.3147376}, pages = {564 -- 580}, year = {2022}, abstract = {The intensity of cosmic radiation may differ over five orders of magnitude within a few hours or days during the Solar Particle Events (SPEs), thus increasing for several orders of magnitude the probability of Single Event Upsets (SEUs) in space-borne electronic systems. Therefore, it is vital to enable the early detection of the SEU rate changes in order to ensure timely activation of dynamic radiation hardening measures. In this paper, an embedded approach for the prediction of SPEs and SRAM SEU rate is presented. The proposed solution combines the real-time SRAM-based SEU monitor, the offline-trained machine learning model and online learning algorithm for the prediction. 
With respect to the state-of-the-art, our solution brings the following benefits: (1) Use of existing on-chip data storage SRAM as a particle detector, thus minimizing the hardware and power overhead, (2) Prediction of SRAM SEU rate one hour in advance, with the fine-grained hourly tracking of SEU variations during SPEs as well as under normal conditions, (3) Online optimization of the prediction model for enhancing the prediction accuracy during run-time, (4) Negligible cost of hardware accelerator design for the implementation of selected machine learning model and online learning algorithm. The proposed design is intended for a highly dependable and self-adaptive multiprocessing system employed in space applications, allowing to trigger the radiation mitigation mechanisms before the onset of high radiation levels.}, language = {en} } @article{CopeBaukmannKlingeretal.2021, author = {Cope, Justin L. and Baukmann, Hannes A. and Klinger, J{\"o}rn E. and Ravarani, Charles N. J. and B{\"o}ttinger, Erwin and Konigorski, Stefan and Schmidt, Marco F.}, title = {Interaction-based feature selection algorithm outperforms polygenic risk score in predicting Parkinson's Disease status}, series = {Frontiers in genetics}, volume = {12}, journal = {Frontiers in genetics}, publisher = {Frontiers Media}, address = {Lausanne}, issn = {1664-8021}, doi = {10.3389/fgene.2021.744557}, pages = {9}, year = {2021}, abstract = {Polygenic risk scores (PRS) aggregating results from genome-wide association studies are the state of the art in the prediction of susceptibility to complex traits or diseases, yet their predictive performance is limited for various reasons, not least of which is their failure to incorporate the effects of gene-gene interactions. Novel machine learning algorithms that use large amounts of data promise to find gene-gene interactions in order to build models with better predictive performance than PRS. 
Here, we present a data preprocessing step by using data-mining of contextual information to reduce the number of features, enabling machine learning algorithms to identify gene-gene interactions. We applied our approach to the Parkinson's Progression Markers Initiative (PPMI) dataset, an observational clinical study of 471 genotyped subjects (368 cases and 152 controls). With an AUC of 0.85 (95\% CI = [0.72; 0.96]), the interaction-based prediction model outperforms the PRS (AUC of 0.58 (95\% CI = [0.42; 0.81])). Furthermore, feature importance analysis of the model provided insights into the mechanism of Parkinson's disease. For instance, the model revealed an interaction of previously described drug target candidate genes TMEM175 and GAPDHP25. These results demonstrate that interaction-based machine learning models can improve genetic prediction models and might provide an answer to the missing heritability problem.}, language = {en} } @article{Doellner2020, author = {D{\"o}llner, J{\"u}rgen Roland Friedrich}, title = {Geospatial artificial intelligence}, series = {Journal of photogrammetry, remote sensing and geoinformation science : PFG : Photogrammetrie, Fernerkundung, Geoinformation}, volume = {88}, journal = {Journal of photogrammetry, remote sensing and geoinformation science : PFG : Photogrammetrie, Fernerkundung, Geoinformation}, number = {1}, publisher = {Springer International Publishing}, address = {Cham}, issn = {2512-2789}, doi = {10.1007/s41064-020-00102-3}, pages = {15 -- 24}, year = {2020}, abstract = {Artificial intelligence (AI) is changing fundamentally the way how IT solutions are implemented and operated across all application domains, including the geospatial domain. This contribution outlines AI-based techniques for 3D point clouds and geospatial digital twins as generic components of geospatial AI. 
First, we briefly reflect on the term "AI" and outline technology developments needed to apply AI to IT solutions, seen from a software engineering perspective. Next, we characterize 3D point clouds as key category of geodata and their role for creating the basis for geospatial digital twins; we explain the feasibility of machine learning (ML) and deep learning (DL) approaches for 3D point clouds. In particular, we argue that 3D point clouds can be seen as a corpus with similar properties as natural language corpora and formulate a "Naturalness Hypothesis" for 3D point clouds. In the main part, we introduce a workflow for interpreting 3D point clouds based on ML/DL approaches that derive domain-specific and application-specific semantics for 3D point clouds without having to create explicit spatial 3D models or explicit rule sets. Finally, examples are shown how ML/DL enables us to efficiently build and maintain base data for geospatial digital twins such as virtual 3D city models, indoor models, or building information models.}, language = {en} } @article{EbersHochRosenkranzetal.2021, author = {Ebers, Martin and Hoch, Veronica R. S. and Rosenkranz, Frank and Ruschemeier, Hannah and Steinr{\"o}tter, Bj{\"o}rn}, title = {The European Commission's proposal for an Artificial Intelligence Act}, series = {J : multidisciplinary scientific journal}, volume = {4}, journal = {J : multidisciplinary scientific journal}, number = {4}, publisher = {MDPI}, address = {Basel}, issn = {2571-8800}, doi = {10.3390/j4040043}, pages = {589 -- 603}, year = {2021}, abstract = {On 21 April 2021, the European Commission presented its long-awaited proposal for a Regulation "laying down harmonized rules on Artificial Intelligence", the so-called "Artificial Intelligence Act" (AIA). This article takes a critical look at the proposed regulation. 
After an introduction (1), the paper analyzes the unclear preemptive effect of the AIA and EU competences (2), the scope of application (3), the prohibited uses of Artificial Intelligence (AI) (4), the provisions on high-risk AI systems (5), the obligations of providers and users (6), the requirements for AI systems with limited risks (7), the enforcement system (8), the relationship of the AIA with the existing legal framework (9), and the regulatory gaps (10). The last section draws some final conclusions (11).}, language = {en} } @article{FrommholdHeimBarabanovetal.2019, author = {Frommhold, Martin and Heim, Arend and Barabanov, Mikhail and Maier, Franziska and M{\"u}hle, Ralf-Udo and Smirenski, Sergei M. and Heim, Wieland}, title = {Breeding habitat and nest-site selection by an obligatory "nest-cleptoparasite", the Amur Falcon Falco amurensis}, series = {Ecology and evolution}, volume = {9}, journal = {Ecology and evolution}, number = {24}, publisher = {Wiley}, address = {Hoboken}, issn = {2045-7758}, doi = {10.1002/ece3.5878}, pages = {14430 -- 14441}, year = {2019}, abstract = {The selection of a nest site is crucial for successful reproduction of birds. Animals which re-use or occupy nest sites constructed by other species often have limited choice. Little is known about the criteria of nest-stealing species to choose suitable nesting sites and habitats. Here, we analyze breeding-site selection of an obligatory "nest-cleptoparasite", the Amur Falcon Falco amurensis. We collected data on nest sites at Muraviovka Park in the Russian Far East, where the species breeds exclusively in nests of the Eurasian Magpie Pica pica. We sampled 117 Eurasian Magpie nests, 38 of which were occupied by Amur Falcons. Nest-specific variables were assessed, and a recently developed habitat classification map was used to derive landscape metrics. We found that Amur Falcons chose a wide range of nesting sites, but significantly preferred nests with a domed roof. 
Breeding pairs of Eurasian Hobby Falco subbuteo and Eurasian Magpie were often found to breed near the nest in about the same distance as neighboring Amur Falcon pairs. Additionally, the occurrence of the species was positively associated with bare soil cover, forest cover, and shrub patches within their home range and negatively with the distance to wetlands. Areas of wetlands and fallow land might be used for foraging since Amur Falcons mostly depend on an insect diet. Additionally, we found that rarely burned habitats were preferred. Overall, the effect of landscape variables on the choice of actual nest sites appeared to be rather small. We used different classification methods to predict the probability of occurrence, of which the Random forest method showed the highest accuracy. The areas determined as suitable habitat showed a high concordance with the actual nest locations. We conclude that Amur Falcons prefer to occupy newly built (domed) nests to ensure high nest quality, as well as nests surrounded by available feeding habitats.}, language = {en} } @article{GhafarianWielandLuettschwageretal.2022, author = {Ghafarian, Fatemeh and Wieland, Ralf and L{\"u}ttschwager, Dietmar and Nendel, Claas}, title = {Application of extreme gradient boosting and Shapley Additive explanations to predict temperature regimes inside forests from standard open-field meteorological data}, series = {Environmental modelling \& software with environment data news}, volume = {156}, journal = {Environmental modelling \& software with environment data news}, publisher = {Elsevier}, address = {Oxford}, issn = {1364-8152}, doi = {10.1016/j.envsoft.2022.105466}, pages = {11}, year = {2022}, abstract = {Forest microclimate can buffer biotic responses to summer heat waves, which are expected to become more extreme under climate warming. Prediction of forest microclimate is limited because meteorological observation standards seldom include situations inside forests. 
We use eXtreme Gradient Boosting - a Machine Learning technique - to predict the microclimate of forest sites in Brandenburg, Germany, using seasonal data comprising weather features. The analysis was amended by applying a SHapley Additive explanation to show the interaction effect of variables and individualised feature attributions. We evaluate model performance in comparison to artificial neural networks, random forest, support vector machine, and multi-linear regression. After implementing a feature selection, an ensemble approach was applied to combine individual models for each forest and improve robustness over a given single prediction model. The resulting model can be applied to translate climate change scenarios into temperatures inside forests to assess temperature-related ecosystem services provided by forests.}, language = {en} } @article{HampfNendelStreyetal.2021, author = {Hampf, Anna and Nendel, Claas and Strey, Simone and Strey, Robert}, title = {Biotic yield losses in the Southern Amazon, Brazil}, series = {Frontiers in plant science : FPLS}, volume = {12}, journal = {Frontiers in plant science : FPLS}, publisher = {Frontiers Media}, address = {Lausanne}, issn = {1664-462X}, doi = {10.3389/fpls.2021.621168}, pages = {16}, year = {2021}, abstract = {Pathogens and animal pests (P\&A) are a major threat to global food security as they directly affect the quantity and quality of food. The Southern Amazon, Brazil's largest domestic region for soybean, maize and cotton production, is particularly vulnerable to the outbreak of P\&A due to its (sub)tropical climate and intensive farming systems. However, little is known about the spatial distribution of P\&A and the related yield losses. Machine learning approaches for the automated recognition of plant diseases can help to overcome this research gap. 
The main objectives of this study are to (1) evaluate the performance of Convolutional Neural Networks (ConvNets) in classifying P\&A, (2) map the spatial distribution of P\&A in the Southern Amazon, and (3) quantify perceived yield and economic losses for the main soybean and maize P\&A. The objectives were addressed by making use of data collected with the smartphone application Plantix. The core of the app's functioning is the automated recognition of plant diseases via ConvNets. Data on expected yield losses were gathered through a short survey included in an "expert" version of the application, which was distributed among agronomists. Between 2016 and 2020, Plantix users collected approximately 78,000 georeferenced P\&A images in the Southern Amazon. The study results indicate a high performance of the trained ConvNets in classifying 420 different crop-disease combinations. Spatial distribution maps and expert-based yield loss estimates indicate that maize rust, bacterial stalk rot and the fall armyworm are among the most severe maize P\&A, whereas soybean is mainly affected by P\&A like anthracnose, downy mildew, frogeye leaf spot, stink bugs and brown spot. Perceived soybean and maize yield losses amount to 12 and 16\%, respectively, resulting in annual yield losses of approximately 3.75 million tonnes for each crop and economic losses of US\$2 billion for both crops together. 
The high level of accuracy of the trained ConvNets, when paired with widespread use from following a citizen-science approach, results in a data source that will shed new light on yield loss estimates, e.g., for the analysis of yield gaps and the development of measures to minimise them.}, language = {en} } @article{HeckerSteckhanEybenetal.2022, author = {Hecker, Pascal and Steckhan, Nico and Eyben, Florian and Schuller, Bj{\"o}rn Wolfgang and Arnrich, Bert}, title = {Voice Analysis for Neurological Disorder Recognition - A Systematic Review and Perspective on Emerging Trends}, series = {Frontiers in Digital Health}, journal = {Frontiers in Digital Health}, publisher = {Frontiers Media SA}, address = {Lausanne, Schweiz}, issn = {2673-253X}, doi = {10.3389/fdgth.2022.842301}, pages = {16}, year = {2022}, abstract = {Quantifying neurological disorders from voice is a rapidly growing field of research and holds promise for unobtrusive and large-scale disorder monitoring. The data recording setup and data analysis pipelines are both crucial aspects to effectively obtain relevant information from participants. Therefore, we performed a systematic review to provide a high-level overview of practices across various neurological disorders and highlight emerging trends. PRISMA-based literature searches were conducted through PubMed, Web of Science, and IEEE Xplore to identify publications in which original (i.e., newly recorded) datasets were collected. Disorders of interest were psychiatric as well as neurodegenerative disorders, such as bipolar disorder, depression, and stress, as well as amyotrophic lateral sclerosis amyotrophic lateral sclerosis, Alzheimer's, and Parkinson's disease, and speech impairments (aphasia, dysarthria, and dysphonia). Of the 43 retrieved studies, Parkinson's disease is represented most prominently with 19 discovered datasets. Free speech and read speech tasks are most commonly used across disorders. 
Besides popular feature extraction toolkits, many studies utilise custom-built feature sets. Correlations of acoustic features with psychiatric and neurodegenerative disorders are presented. In terms of analysis, statistical analysis for significance of individual features is commonly used, as well as predictive modeling approaches, especially with support vector machines and a small number of artificial neural networks. An emerging trend and recommendation for future studies is to collect data in everyday life to facilitate longitudinal data collection and to capture the behavior of participants more naturally. Another emerging trend is to record additional modalities to voice, which can potentially increase analytical performance.}, language = {en} } @article{KibrikKhudyakovaDobrovetal.2016, author = {Kibrik, Andrej A. and Khudyakova, Mariya V. and Dobrov, Grigory B. and Linnik, Anastasia and Zalmanov, Dmitrij A.}, title = {Referential Choice}, series = {Frontiers in psychology}, volume = {7}, journal = {Frontiers in psychology}, publisher = {Frontiers Research Foundation}, address = {Lausanne}, issn = {1664-1078}, doi = {10.3389/fpsyg.2016.01429}, year = {2016}, abstract = {We report a study of referential choice in discourse production, understood as the choice between various types of referential devices, such as pronouns and full noun phrases. Our goal is to predict referential choice, and to explore to what extent such prediction is possible. Our approach to referential choice includes a cognitively informed theoretical component, corpus analysis, machine learning methods and experimentation with human participants. Machine learning algorithms make use of 25 factors, including referent's properties (such as animacy and protagonism), the distance between a referential expression and its antecedent, the antecedent's syntactic role, and so on. 
Having found the predictions of our algorithm to coincide with the original almost 90\% of the time, we hypothesized that fully accurate prediction is not possible because, in many situations, more than one referential option is available. This hypothesis was supported by an experimental study, in which participants answered questions about either the original text in the corpus, or about a text modified in accordance with the algorithm's prediction. Proportions of correct answers to these questions, as well as participants' rating of the questions' difficulty, suggested that divergences between the algorithm's prediction and the original referential device in the corpus occur overwhelmingly in situations where the referential choice is not categorical.}, language = {en} } @article{KibrikKhudyakovaDobrovetal.2016a, author = {Kibrik, Andrej A. and Khudyakova, Mariya V. and Dobrov, Grigory B. and Linnik, Anastasia and Zalmanov, Dmitrij A.}, title = {Referential Choice: Predictability and Its Limits}, series = {Frontiers in psychology}, volume = {7}, journal = {Frontiers in psychology}, publisher = {Frontiers Research Foundation}, address = {Lausanne}, issn = {1664-1078}, doi = {10.3389/fpsyg.2016.01429}, pages = {9939 -- 9947}, year = {2016}, abstract = {We report a study of referential choice in discourse production, understood as the choice between various types of referential devices, such as pronouns and full noun phrases. Our goal is to predict referential choice, and to explore to what extent such prediction is possible. Our approach to referential choice includes a cognitively informed theoretical component, corpus analysis, machine learning methods and experimentation with human participants. Machine learning algorithms make use of 25 factors, including referent's properties (such as animacy and protagonism), the distance between a referential expression and its antecedent, the antecedent's syntactic role, and so on.
Having found the predictions of our algorithm to coincide with the original almost 90\% of the time, we hypothesized that fully accurate prediction is not possible because, in many situations, more than one referential option is available. This hypothesis was supported by an experimental study, in which participants answered questions about either the original text in the corpus, or about a text modified in accordance with the algorithm's prediction. Proportions of correct answers to these questions, as well as participants' rating of the questions' difficulty, suggested that divergences between the algorithm's prediction and the original referential device in the corpus occur overwhelmingly in situations where the referential choice is not categorical.}, language = {en} } @article{KonakWegnerArnrich2020, author = {Konak, Orhan and Wegner, Pit and Arnrich, Bert}, title = {IMU-Based Movement Trajectory Heatmaps for Human Activity Recognition}, series = {Sensors}, volume = {20}, journal = {Sensors}, number = {24}, publisher = {MDPI}, address = {Basel}, issn = {1424-8220}, doi = {10.3390/s20247179}, pages = {15}, year = {2020}, abstract = {Recent trends in ubiquitous computing have led to a proliferation of studies that focus on human activity recognition (HAR) utilizing inertial sensor data that consist of acceleration, orientation and angular velocity. However, the performances of such approaches are limited by the amount of annotated training data, especially in fields where annotating data is highly time-consuming and requires specialized professionals, such as in healthcare. In image classification, this limitation has been mitigated by powerful oversampling techniques such as data augmentation. Using this technique, this work evaluates to what extent transforming inertial sensor data into movement trajectories and into 2D heatmap images can be advantageous for HAR when data are scarce. 
A convolutional long short-term memory (ConvLSTM) network that incorporates spatiotemporal correlations was used to classify the heatmap images. Evaluation was carried out on Deep Inertial Poser (DIP), a known dataset composed of inertial sensor data. The results obtained suggest that for datasets with large numbers of subjects, using state-of-the-art methods remains the best alternative. However, a performance advantage was achieved for small datasets, which is usually the case in healthcare. Moreover, movement trajectories provide a visual representation of human activities, which can help researchers to better interpret and analyze motion patterns.}, language = {en} } @article{KuehnHainzlDahmetal.2022, author = {K{\"u}hn, Daniela and Hainzl, Sebastian and Dahm, Torsten and Richter, Gudrun and Vera Rodriguez, Ismael}, title = {A review of source models to further the understanding of the seismicity of the Groningen field}, series = {Netherlands journal of geosciences : NJG}, volume = {101}, journal = {Netherlands journal of geosciences : NJG}, publisher = {Cambridge Univ. Press}, address = {Cambridge}, issn = {0016-7746}, doi = {10.1017/njg.2022.7}, pages = {12}, year = {2022}, abstract = {The occurrence of felt earthquakes due to gas production in Groningen has initiated numerous studies and model attempts to understand and quantify induced seismicity in this region. The whole bandwidth of available models spans the range from fully deterministic models to purely empirical and stochastic models. In this article, we summarise the most important model approaches, describing their main achievements and limitations. 
In addition, we discuss remaining open questions and potential future directions of development.}, language = {en} } @article{LevyMussackBrunneretal.2020, author = {Levy, Jessica and Mussack, Dominic and Brunner, Martin and Keller, Ulrich and Cardoso-Leite, Pedro and Fischbach, Antoine}, title = {Contrasting classical and machine learning approaches in the estimation of value-added scores in large-scale educational data}, series = {Frontiers in psychology}, volume = {11}, journal = {Frontiers in psychology}, publisher = {Frontiers Research Foundation}, address = {Lausanne}, issn = {1664-1078}, doi = {10.3389/fpsyg.2020.02190}, pages = {18}, year = {2020}, abstract = {There is no consensus on which statistical model estimates school value-added (VA) most accurately. To date, the two most common statistical models used for the calculation of VA scores are two classical methods: linear regression and multilevel models. These models have the advantage of being relatively transparent and thus understandable for most researchers and practitioners. However, these statistical models are bound to certain assumptions (e.g., linearity) that might limit their prediction accuracy. Machine learning methods, which have yielded spectacular results in numerous fields, may be a valuable alternative to these classical models. Although big data is not new in general, it is relatively new in the realm of social sciences and education. New types of data require new data analytical approaches. Such techniques have already evolved in fields with a long tradition in crunching big data (e.g., gene technology). The objective of the present paper is to competently apply these "imported" techniques to education data, more precisely VA scores, and assess when and how they can extend or replace the classical psychometrics toolbox. 
The different models include linear and non-linear methods and extend classical models with the most commonly used machine learning methods (i.e., random forest, neural networks, support vector machines, and boosting). We used representative data of 3,026 students in 153 schools who took part in the standardized achievement tests of the Luxembourg School Monitoring Program in grades 1 and 3. Multilevel models outperformed classical linear and polynomial regressions, as well as different machine learning models. However, it could be observed that across all schools, school VA scores from different model types correlated highly. Yet, the percentage of disagreements as compared to multilevel models was not trivial and real-life implications for individual schools may still be dramatic depending on the model type used. Implications of these results and possible ethical concerns regarding the use of machine learning methods for decision-making in education are discussed.}, language = {en} } @article{PanzerBenderGronau2022, author = {Panzer, Marcel and Bender, Benedict and Gronau, Norbert}, title = {Neural agent-based production planning and control}, series = {Journal of Manufacturing Systems}, volume = {65}, journal = {Journal of Manufacturing Systems}, publisher = {Elsevier}, address = {Amsterdam}, issn = {0278-6125}, doi = {10.1016/j.jmsy.2022.10.019}, pages = {743 -- 766}, year = {2022}, abstract = {Nowadays, production planning and control must cope with mass customization, increased fluctuations in demand, and high competition pressures. Despite prevailing market risks, planning accuracy and increased adaptability in the event of disruptions or failures must be ensured, while simultaneously optimizing key process indicators. To manage that complex task, neural networks that can process large quantities of high-dimensional data in real time have been widely adopted in recent years. 
Although these are already extensively deployed in production systems, a systematic review of applications and implemented agent embeddings and architectures has not yet been conducted. The main contribution of this paper is to provide researchers and practitioners with an overview of applications and applied embeddings and to motivate further research in neural agent-based production. Findings indicate that neural agents are not only deployed in diverse applications, but are also increasingly implemented in multi-agent environments or in combination with conventional methods — leveraging performances compared to benchmarks and reducing dependence on human experience. This not only implies a more sophisticated focus on distributed production resources, but also broadening the perspective from a local to a global scale. Nevertheless, future research must further increase scalability and reproducibility to guarantee a simplified transfer of results to reality.}, language = {en} } @article{RossoNendelGilardietal.2022, author = {Rosso, Pablo and Nendel, Claas and Gilardi, Nicolas and Udroiu, Cosmin and Chlebowski, Florent}, title = {Processing of remote sensing information to retrieve leaf area index in barley}, series = {Precision agriculture}, volume = {23}, journal = {Precision agriculture}, number = {4}, publisher = {Springer}, address = {Dordrecht}, issn = {1385-2256}, doi = {10.1007/s11119-022-09893-4}, pages = {1449 -- 1472}, year = {2022}, abstract = {Leaf area index (LAI) is a key variable in understanding and modeling crop-environment interactions. With the advent of increasingly higher spatial resolution satellites and sensors mounted on remotely piloted aircrafts (RPAs), the use of remote sensing in precision agriculture is becoming more common. 
Since also the availability of methods to retrieve LAI from image data have also drastically expanded, it is necessary to test simultaneously as many methods as possible to understand the advantages and disadvantages of each approach. Ground-based LAI data from three years of barley experiments were related to remote sensing information using vegetation indices (VI), machine learning (ML) and radiative transfer models (RTM), to assess the relative accuracy and efficacy of these methods. The optimized soil adjusted vegetation index and a modified version of the Weighted Difference Vegetation Index performed slightly better than any other retrieval method. However, all methods yielded coefficients of determination of around 0.7 to 0.9. The best performing machine learning algorithms achieved higher accuracies when four Sentinel-2 bands instead of 12 were used. Also, the good performance of VIs and the satisfactory performance of the 4-band RTM, strongly support the synergistic use of satellites and RPAs in precision agriculture. One of the methods used, Sen2-Agri, an open source ML-RTM-based operational system, was also able to accurately retrieve LAI, although it is restricted to Sentinel-2 and Landsat data. This study shows the benefits of testing simultaneously a broad range of retrieval methods to monitor crops for precision agriculture.}, language = {en} } @article{RyoJeschkeRilligetal.2020, author = {Ryo, Masahiro and Jeschke, Jonathan M. and Rillig, Matthias C. 
and Heger, Tina}, title = {Machine learning with the hierarchy-of-hypotheses (HoH) approach discovers novel pattern in studies on biological invasions}, series = {Research synthesis methods}, volume = {11}, journal = {Research synthesis methods}, number = {1}, publisher = {Wiley}, address = {Hoboken}, issn = {1759-2879}, doi = {10.1002/jrsm.1363}, pages = {66 -- 73}, year = {2020}, abstract = {Research synthesis on simple yet general hypotheses and ideas is challenging in scientific disciplines studying highly context-dependent systems such as medical, social, and biological sciences. This study shows that machine learning, equation-free statistical modeling of artificial intelligence, is a promising synthesis tool for discovering novel patterns and the source of controversy in a general hypothesis. We apply a decision tree algorithm, assuming that evidence from various contexts can be adequately integrated in a hierarchically nested structure. As a case study, we analyzed 163 articles that studied a prominent hypothesis in invasion biology, the enemy release hypothesis. We explored if any of the nine attributes that classify each study can differentiate conclusions as classification problem. Results corroborated that machine learning can be useful for research synthesis, as the algorithm could detect patterns that had been already focused in previous narrative reviews. Compared with the previous synthesis study that assessed the same evidence collection based on experts' judgement, the algorithm has newly proposed that the studies focusing on Asian regions mostly supported the hypothesis, suggesting that more detailed investigations in these regions can enhance our understanding of the hypothesis. 
We suggest that machine learning algorithms can be a promising synthesis tool especially where studies (a) reformulate a general hypothesis from different perspectives, (b) use different methods or variables, or (c) report insufficient information for conducting meta-analyses.}, language = {en} } @article{SchmidtHesseAttingeretal.2020, author = {Schmidt, Lennart and Hesse, Falk and Attinger, Sabine and Kumar, Rohini}, title = {Challenges in applying machine learning models for hydrological inference}, series = {Water resources research}, volume = {56}, journal = {Water resources research}, number = {5}, publisher = {American Geophysical Union}, address = {Washington}, issn = {0043-1397}, doi = {10.1029/2019WR025924}, pages = {10}, year = {2020}, abstract = {Machine learning (ML) algorithms are being increasingly used in Earth and Environmental modeling studies owing to the ever-increasing availability of diverse data sets and computational resources as well as advancement in ML algorithms. Despite advances in their predictive accuracy, the usefulness of ML algorithms for inference remains elusive. In this study, we employ two popular ML algorithms, artificial neural networks and random forest, to analyze a large data set of flood events across Germany with the goals to analyze their predictive accuracy and their usability to provide insights to hydrologic system functioning. The results of the ML algorithms are contrasted against a parametric approach based on multiple linear regression. For analysis, we employ a model-agnostic framework named Permuted Feature Importance to derive the influence of models' predictors. This allows us to compare the results of different algorithms for the first time in the context of hydrology. 
Our main findings are that (1) the ML models achieve higher prediction accuracy than linear regression, (2) the results reflect basic hydrological principles, but (3) further inference is hindered by the heterogeneity of results across algorithms. Thus, we conclude that the problem of equifinality as known from classical hydrological modeling also exists for ML and severely hampers its potential for inference. To account for the observed problems, we propose that when employing ML for inference, this should be made by using multiple algorithms and multiple methods, of which the latter should be embedded in a cross-validation routine.}, language = {en} } @article{SchmidtHesseAttingeretal.2020a, author = {Schmidt, Lennart and He{\ss}e, Falk and Attinger, Sabine and Kumar, Rohini}, title = {Challenges in applying machine learning models for hydrological inference: a case study for flooding events across Germany}, series = {Water Resources Research}, volume = {56}, journal = {Water Resources Research}, number = {5}, publisher = {John Wiley \& Sons, Inc.}, address = {New Jersey}, pages = {10}, year = {2020}, abstract = {Machine learning (ML) algorithms are being increasingly used in Earth and Environmental modeling studies owing to the ever-increasing availability of diverse data sets and computational resources as well as advancement in ML algorithms. Despite advances in their predictive accuracy, the usefulness of ML algorithms for inference remains elusive. In this study, we employ two popular ML algorithms, artificial neural networks and random forest, to analyze a large data set of flood events across Germany with the goals to analyze their predictive accuracy and their usability to provide insights to hydrologic system functioning. The results of the ML algorithms are contrasted against a parametric approach based on multiple linear regression.
For analysis, we employ a model-agnostic framework named Permuted Feature Importance to derive the influence of models' predictors. This allows us to compare the results of different algorithms for the first time in the context of hydrology. Our main findings are that (1) the ML models achieve higher prediction accuracy than linear regression, (2) the results reflect basic hydrological principles, but (3) further inference is hindered by the heterogeneity of results across algorithms. Thus, we conclude that the problem of equifinality as known from classical hydrological modeling also exists for ML and severely hampers its potential for inference. To account for the observed problems, we propose that when employing ML for inference, this should be made by using multiple algorithms and multiple methods, of which the latter should be embedded in a cross-validation routine.}, language = {en} } @article{SchudomaLarhlimiWalther2011, author = {Schudoma, Christian and Larhlimi, Abdelhalim and Walther, Dirk}, title = {The influence of the local sequence environment on RNA loop structures}, series = {RNA : a publication of the RNA Society}, volume = {17}, journal = {RNA : a publication of the RNA Society}, number = {7}, publisher = {Cold Spring Harbor Laboratory Press}, address = {Cold Spring Harbor, NY}, issn = {1355-8382}, doi = {10.1261/rna.2550211}, pages = {1247 -- 1257}, year = {2011}, abstract = {RNA folding is assumed to be a hierarchical process. The secondary structure of an RNA molecule, signified by base-pairing and stacking interactions between the paired bases, is formed first. Subsequently, the RNA molecule adopts an energetically favorable three-dimensional conformation in the structural space determined mainly by the rotational degrees of freedom associated with the backbone of regions of unpaired nucleotides (loops). 
To what extent the backbone conformation of RNA loops also results from interactions within the local sequence context or rather follows global optimization constraints alone has not been addressed yet. Because the majority of base stacking interactions are exerted locally, a critical influence of local sequence on local structure appears plausible. Thus, local loop structure ought to be predictable, at least in part, from the local sequence context alone. To test this hypothesis, we used Random Forests on a nonredundant data set of unpaired nucleotides extracted from 97 X-ray structures from the Protein Data Bank (PDB) to predict discrete backbone angle conformations given by the discretized eta/theta-pseudo-torsional space. Predictions on balanced sets with four to six conformational classes using local sequence information yielded average accuracies of up to 55\%, thus significantly better than expected by chance (17\%-25\%). Bases close to the central nucleotide appear to be most tightly linked to its conformation. Our results suggest that RNA loop structure does not only depend on long-range base-pairing interactions; instead, it appears that local sequence context exerts a significant influence on the formation of the local loop structure.}, language = {en} } @article{SeewannVerwiebeBuderetal.2022, author = {Seewann, Lena and Verwiebe, Roland and Buder, Claudia and Fritsch, Nina-Sophie}, title = {"Broadcast your gender."}, series = {Frontiers in Big Data}, journal = {Frontiers in Big Data}, volume = {5}, publisher = {Frontiers}, address = {Lausanne, Schweiz}, issn = {2624-909X}, doi = {10.3389/fdata.2022.908636}, pages = {16}, year = {2022}, abstract = {Social media platforms provide a large array of behavioral data relevant to social scientific research. However, key information such as sociodemographic characteristics of agents are often missing. This paper aims to compare four methods of classifying social attributes from text.
Specifically, we are interested in estimating the gender of German social media creators. By using the example of a random sample of 200 YouTube channels, we compare several classification methods, namely (1) a survey among university staff, (2) a name dictionary method with the World Gender Name Dictionary as a reference list, (3) an algorithmic approach using the website gender-api.com, and (4) a Multinomial Na{\"i}ve Bayes (MNB) machine learning technique. These different methods identify gender attributes based on YouTube channel names and descriptions in German but are adaptable to other languages. Our contribution will evaluate the share of identifiable channels, accuracy and meaningfulness of classification, as well as limits and benefits of each approach. We aim to address methodological challenges connected to classifying gender attributes for YouTube channels as well as related to reinforcing stereotypes and ethical implications.}, language = {en} } @article{SmirnovBerrendorfShpritsetal.2020, author = {Smirnov, Artem and Berrendorf, Max and Shprits, Yuri Y. and Kronberg, Elena A. and Allison, Hayley J. and Aseev, Nikita and Zhelavskaya, Irina and Morley, Steven K. and Reeves, Geoffrey D. and Carver, Matthew R. and Effenberger, Frederic}, title = {Medium energy electron flux in earth's outer radiation belt (MERLIN)}, series = {Space weather : the international journal of research and applications}, volume = {18}, journal = {Space weather : the international journal of research and applications}, number = {11}, publisher = {American geophysical union, AGU}, address = {Washington}, issn = {1542-7390}, doi = {10.1029/2020SW002532}, pages = {20}, year = {2020}, abstract = {The radiation belts of the Earth, filled with energetic electrons, comprise complex and dynamic systems that pose a significant threat to satellite operation. 
While various models of electron flux both for low and relativistic energies have been developed, the behavior of medium energy (120-600 keV) electrons, especially in the MEO region, remains poorly quantified. At these energies, electrons are driven by both convective and diffusive transport, and their prediction usually requires sophisticated 4D modeling codes. In this paper, we present an alternative approach using the Light Gradient Boosting (LightGBM) machine learning algorithm. The Medium Energy electRon fLux In Earth's outer radiatioN belt (MERLIN) model takes as input the satellite position, a combination of geomagnetic indices and solar wind parameters including the time history of velocity, and does not use persistence. MERLIN is trained on >15 years of the GPS electron flux data and tested on more than 1.5 years of measurements. Tenfold cross validation yields that the model predicts the MEO radiation environment well, both in terms of dynamics and amplitudes of flux. Evaluation on the test set shows high correlation between the predicted and observed electron flux (0.8) and low values of absolute error. The MERLIN model can have wide space weather applications, providing information for the scientific community in the form of radiation belts reconstructions, as well as industry for satellite mission design, nowcast of the MEO environment, and surface charging analysis.}, language = {en} } @article{SprengerErbanSeddigetal.2017, author = {Sprenger, Heike and Erban, Alexander and Seddig, Sylvia and Rudack, Katharina and Thalhammer, Anja and Le, Mai Q. and Walther, Dirk and Zuther, Ellen and Koehl, Karin I.
and Kopka, Joachim and Hincha, Dirk K.}, title = {Metabolite and transcript markers for the prediction of potato drought tolerance}, series = {Plant Biotechnology Journal}, volume = {16}, journal = {Plant Biotechnology Journal}, number = {4}, publisher = {Wiley}, address = {Hoboken}, issn = {1467-7644}, doi = {10.1111/pbi.12840}, pages = {939 -- 950}, year = {2017}, abstract = {Potato (Solanum tuberosum L.) is one of the most important food crops worldwide. Current potato varieties are highly susceptible to drought stress. In view of global climate change, selection of cultivars with improved drought tolerance and high yield potential is of paramount importance. Drought tolerance breeding of potato is currently based on direct selection according to yield and phenotypic traits and requires multiple trials under drought conditions. Marker-assisted selection (MAS) is cheaper, faster and reduces classification errors caused by noncontrolled environmental effects. We analysed 31 potato cultivars grown under optimal and reduced water supply in six independent field trials. Drought tolerance was determined as tuber starch yield. Leaf samples from young plants were screened for preselected transcript and nontargeted metabolite abundance using qRT-PCR and GC-MS profiling, respectively. Transcript marker candidates were selected from a published RNA-Seq data set. A Random Forest machine learning approach extracted metabolite and transcript markers for drought tolerance prediction with low error rates of 6\% and 9\%, respectively. Moreover, by combining transcript and metabolite markers, the prediction error was reduced to 4.3\%. Feature selection from Random Forest models allowed model minimization, yielding a minimal combination of only 20 metabolite and transcript markers that were successfully tested for their reproducibility in 16 independent agronomic field trials. 
We demonstrate that a minimum combination of transcript and metabolite markers sampled at early cultivation stages predicts potato yield stability under drought largely independent of seasonal and regional agronomic conditions.}, language = {en} } @article{SteinbergVasyuraBathkeGaebleretal.2021, author = {Steinberg, Andreas and Vasyura-Bathke, Hannes and Gaebler, Peter Jost and Ohrnberger, Matthias and Ceranna, Lars}, title = {Estimation of seismic moment tensors using variational inference machine learning}, series = {Journal of geophysical research : Solid earth}, volume = {126}, journal = {Journal of geophysical research : Solid earth}, number = {10}, publisher = {American Geophysical Union}, address = {Washington}, issn = {2169-9313}, doi = {10.1029/2021JB022685}, pages = {16}, year = {2021}, abstract = {We present an approach for rapidly estimating full moment tensors of earthquakes and their parameter uncertainties based on short time windows of recorded seismic waveform data by considering deep learning of Bayesian Neural Networks (BNNs). The individual neural networks are trained on synthetic seismic waveform data and corresponding known earthquake moment-tensor parameters. A monitoring volume has been predefined to form a three-dimensional grid of locations and to train a BNN for each grid point. Variational inference on several of these networks allows us to consider several sources of error and how they affect the estimated full moment-tensor parameters and their uncertainties. In particular, we demonstrate how estimated parameter distributions are affected by uncertainties in the earthquake centroid location in space and time as well as in the assumed Earth structure model. We apply our approach as a proof of concept on seismic waveform recordings of aftershocks of the Ridgecrest 2019 earthquake with moment magnitudes ranging from Mw 2.7 to Mw 5.5. 
Overall, good agreement has been achieved between inferred parameter ensembles and independently estimated parameters using classical methods. Our developed approach is fast and robust, and therefore, suitable for down-stream analyses that need rapid estimates of the source mechanism for a large number of earthquakes.}, language = {en} } @article{TongNikoloski2020, author = {Tong, Hao and Nikoloski, Zoran}, title = {Machine learning approaches for crop improvement}, series = {Journal of plant physiology : biochemistry, physiology, molecular biology and biotechnology of plants}, volume = {257}, journal = {Journal of plant physiology : biochemistry, physiology, molecular biology and biotechnology of plants}, publisher = {Elsevier}, address = {M{\"u}nchen}, issn = {0176-1617}, doi = {10.1016/j.jplph.2020.153354}, pages = {13}, year = {2020}, abstract = {Highly efficient and accurate selection of elite genotypes can lead to dramatic shortening of the breeding cycle in major crops relevant for sustaining present demands for food, feed, and fuel. In contrast to classical approaches that emphasize the need for resource-intensive phenotyping at all stages of artificial selection, genomic selection dramatically reduces the need for phenotyping. Genomic selection relies on advances in machine learning and the availability of genotyping data to predict agronomically relevant phenotypic traits. Here we provide a systematic review of machine learning approaches applied for genomic selection of single and multiple traits in major crops in the past decade. We emphasize the need to gather data on intermediate phenotypes, e.g. metabolite, protein, and gene expression levels, along with developments of modeling techniques that can lead to further improvements of genomic selection. In addition, we provide a critical view of factors that affect genomic selection, with attention to transferability of models between different environments. 
Finally, we highlight the future aspects of integrating high-throughput molecular phenotypic data from omics technologies with biological networks for crop improvement.}, language = {en} } @article{VaidChanChaudharyetal.2021, author = {Vaid, Akhil and Chan, Lili and Chaudhary, Kumardeep and Jaladanki, Suraj K. and Paranjpe, Ishan and Russak, Adam J. and Kia, Arash and Timsina, Prem and Levin, Matthew A. and He, John Cijiang and B{\"o}ttinger, Erwin and Charney, Alexander W. and Fayad, Zahi A. and Coca, Steven G. and Glicksberg, Benjamin S. and Nadkarni, Girish N.}, title = {Predictive approaches for acute dialysis requirement and death in COVID-19}, series = {Clinical journal of the American Society of Nephrology : CJASN}, volume = {16}, journal = {Clinical journal of the American Society of Nephrology : CJASN}, number = {8}, publisher = {American Society of Nephrology}, address = {Washington}, organization = {MSCIC}, issn = {1555-9041}, doi = {10.2215/CJN.17311120}, pages = {1158 -- 1168}, year = {2021}, abstract = {Background and objectives AKI treated with dialysis initiation is a common complication of coronavirus disease 2019 (COVID-19) among hospitalized patients. However, dialysis supplies and personnel are often limited. Design, setting, participants, \& measurements Using data from adult patients hospitalized with COVID-19 from five hospitals from the Mount Sinai Health System who were admitted between March 10 and December 26, 2020, we developed and validated several models (logistic regression, Least Absolute Shrinkage and Selection Operator (LASSO), random forest, and eXtreme Gradient Boosting [XGBoost; with and without imputation]) for predicting treatment with dialysis or death at various time horizons (1, 3, 5, and 7 days) after hospital admission. Patients admitted to the Mount Sinai Hospital were used for internal validation, whereas the other hospitals formed part of the external validation cohort.
Features included demographics, comorbidities, and laboratory and vital signs within 12 hours of hospital admission. Results A total of 6093 patients (2442 in training and 3651 in external validation) were included in the final cohort. Of the different modeling approaches used, XGBoost without imputation had the highest area under the receiver operating characteristic (AUROC) curve on internal validation (range of 0.93-0.98) and area under the precisionrecall curve (AUPRC; range of 0.78-0.82) for all time points. XGBoost without imputation also had the highest test parameters on external validation (AUROC range of 0.85-0.87, and AUPRC range of 0.27-0.54) across all time windows. XGBoost without imputation outperformed all models with higher precision and recall (mean difference in AUROC of 0.04; mean difference in AUPRC of 0.15). Features of creatinine, BUN, and red cell distribution width were major drivers of the model's prediction. Conclusions An XGBoost model without imputation for prediction of a composite outcome of either death or dialysis in patients positive for COVID-19 had the best performance, as compared with standard and other machine learning models.}, language = {en} } @article{VaidSomaniRussaketal.2020, author = {Vaid, Akhil and Somani, Sulaiman and Russak, Adam J. and De Freitas, Jessica K. and Chaudhry, Fayzan F. and Paranjpe, Ishan and Johnson, Kipp W. and Lee, Samuel J. and Miotto, Riccardo and Richter, Felix and Zhao, Shan and Beckmann, Noam D. and Naik, Nidhi and Kia, Arash and Timsina, Prem and Lala, Anuradha and Paranjpe, Manish and Golden, Eddye and Danieletto, Matteo and Singh, Manbir and Meyer, Dara and O'Reilly, Paul F. and Huckins, Laura and Kovatch, Patricia and Finkelstein, Joseph and Freeman, Robert M. and Argulian, Edgar and Kasarskis, Andrew and Percha, Bethany and Aberg, Judith A. and Bagiella, Emilia and Horowitz, Carol R. and Murphy, Barbara and Nestler, Eric J. and Schadt, Eric E. and Cho, Judy H. 
and Cordon-Cardo, Carlos and Fuster, Valentin and Charney, Dennis S. and Reich, David L. and B{\"o}ttinger, Erwin and Levin, Matthew A. and Narula, Jagat and Fayad, Zahi A. and Just, Allan C. and Charney, Alexander W. and Nadkarni, Girish N. and Glicksberg, Benjamin S.}, title = {Machine learning to predict mortality and critical events in a cohort of patients with COVID-19 in New York City: model development and validation}, series = {Journal of medical internet research : international scientific journal for medical research, information and communication on the internet ; JMIR}, volume = {22}, journal = {Journal of medical internet research : international scientific journal for medical research, information and communication on the internet ; JMIR}, number = {11}, publisher = {Healthcare World}, address = {Richmond, Va.}, issn = {1439-4456}, doi = {10.2196/24018}, pages = {19}, year = {2020}, abstract = {Background: COVID-19 has infected millions of people worldwide and is responsible for several hundred thousand fatalities. The COVID-19 pandemic has necessitated thoughtful resource allocation and early identification of high-risk patients. However, effective methods to meet these needs are lacking. Objective: The aims of this study were to analyze the electronic health records (EHRs) of patients who tested positive for COVID-19 and were admitted to hospitals in the Mount Sinai Health System in New York City; to develop machine learning models for making predictions about the hospital course of the patients over clinically meaningful time horizons based on patient characteristics at admission; and to assess the performance of these models at multiple hospitals and time points. Methods: We used Extreme Gradient Boosting (XGBoost) and baseline comparator models to predict in-hospital mortality and critical events at time windows of 3, 5, 7, and 10 days from admission. 
Our study population included harmonized EHR data from five hospitals in New York City for 4098 COVID-19-positive patients admitted from March 15 to May 22, 2020. The models were first trained on patients from a single hospital (n=1514) before or on May 1, externally validated on patients from four other hospitals (n=2201) before or on May 1, and prospectively validated on all patients after May 1 (n=383). Finally, we established model interpretability to identify and rank variables that drive model predictions. Results: Upon cross-validation, the XGBoost classifier outperformed baseline models, with an area under the receiver operating characteristic curve (AUC-ROC) for mortality of 0.89 at 3 days, 0.85 at 5 and 7 days, and 0.84 at 10 days. XGBoost also performed well for critical event prediction, with an AUC-ROC of 0.80 at 3 days, 0.79 at 5 days, 0.80 at 7 days, and 0.81 at 10 days. In external validation, XGBoost achieved an AUC-ROC of 0.88 at 3 days, 0.86 at 5 days, 0.86 at 7 days, and 0.84 at 10 days for mortality prediction. Similarly, the unimputed XGBoost model achieved an AUC-ROC of 0.78 at 3 days, 0.79 at 5 days, 0.80 at 7 days, and 0.81 at 10 days. Trends in performance on prospective validation sets were similar. At 7 days, acute kidney injury on admission, elevated LDH, tachypnea, and hyperglycemia were the strongest drivers of critical event prediction, while higher age, anion gap, and C-reactive protein were the strongest drivers of mortality prediction. Conclusions: We externally and prospectively trained and validated machine learning models for mortality and critical events for patients with COVID-19 at different time horizons. 
These models identified at-risk patients and uncovered underlying relationships that predicted outcomes.}, language = {en} } @article{WilkschAbramova2023, author = {Wilksch, Moritz and Abramova, Olga}, title = {PyFin-sentiment}, series = {International journal of information management data insights}, volume = {3}, journal = {International journal of information management data insights}, number = {1}, publisher = {Elsevier}, address = {Amsterdam}, issn = {2667-0968}, doi = {10.1016/j.jjimei.2023.100171}, pages = {10}, year = {2023}, abstract = {Responding to the poor performance of generic automated sentiment analysis solutions on domain-specific texts, we collect a dataset of 10,000 tweets discussing the topics of finance and investing. We manually assign each tweet its market sentiment, i.e., the investor's anticipation of a stock's future return. Using this data, we show that all existing sentiment models trained on adjacent domains struggle with accurate market sentiment analysis due to the task's specialized vocabulary. Consequently, we design, train, and deploy our own sentiment model. It outperforms all previous models (VADER, NTUSD-Fin, FinBERT, TwitterRoBERTa) when evaluated on Twitter posts. On posts from a different platform, our model performs on par with BERT-based large language models. We achieve this result at a fraction of the training and inference costs due to the model's simple design. 
We publish the artifact as a python library to facilitate its use by future researchers and practitioners.}, language = {en} } @article{WulffBuschhueterWestphaletal.2020, author = {Wulff, Peter and Buschh{\"u}ter, David and Westphal, Andrea and Nowak, Anna and Becker, Lisa and Robalino, Hugo and Stede, Manfred and Borowski, Andreas}, title = {Computer-based classification of preservice physics teachers' written reflections}, series = {Journal of science education and technology}, volume = {30}, journal = {Journal of science education and technology}, number = {1}, publisher = {Springer}, address = {Dordrecht}, issn = {1059-0145}, doi = {10.1007/s10956-020-09865-1}, pages = {1 -- 15}, year = {2020}, abstract = {Reflecting in written form on one's teaching enactments has been considered a facilitator for teachers' professional growth in university-based preservice teacher education. Writing a structured reflection can be facilitated through external feedback. However, researchers noted that feedback in preservice teacher education often relies on holistic, rather than more content-based, analytic feedback because educators oftentimes lack resources (e.g., time) to provide more analytic feedback. To overcome this impediment to feedback for written reflection, advances in computer technology can be of use. Hence, this study sought to utilize techniques of natural language processing and machine learning to train a computer-based classifier that classifies preservice physics teachers' written reflections on their teaching enactments in a German university teacher education program. To do so, a reflection model was adapted to physics education. It was then tested to what extent the computer-based classifier could accurately classify the elements of the reflection model in segments of preservice physics teachers' written reflections. 
Multinomial logistic regression using word count as a predictor was found to yield acceptable average human-computer agreement (F1-score on held-out test dataset of 0.56) so that it might fuel further development towards an automated feedback tool that supplements existing holistic feedback for written reflections with data-based, analytic feedback.}, language = {en} } @article{WulffMientusNowaketal.2023, author = {Wulff, Peter and Mientus, Lukas and Nowak, Anna and Borowski, Andreas}, title = {KI-basierte Auswertung von schriftlichen Unterrichtsreflexionen im Fach Physik und automatisierte R{\"u}ckmeldung}, series = {PSI-Potsdam: Ergebnisbericht zu den Aktivit{\"a}ten im Rahmen der Qualit{\"a}tsoffensive Lehrerbildung (2019-2023) (Potsdamer Beitr{\"a}ge zur Lehrerbildung und Bildungsforschung ; 3)}, journal = {PSI-Potsdam: Ergebnisbericht zu den Aktivit{\"a}ten im Rahmen der Qualit{\"a}tsoffensive Lehrerbildung (2019-2023) (Potsdamer Beitr{\"a}ge zur Lehrerbildung und Bildungsforschung ; 3)}, number = {3}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-568-2}, issn = {2626-3556}, doi = {10.25932/publishup-61636}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-616363}, pages = {103 -- 115}, year = {2023}, abstract = {F{\"u}r die Entwicklung professioneller Handlungskompetenzen angehender Lehrkr{\"a}fte stellt die Unterrichtsreflexion ein wichtiges Instrument dar, um Theoriewissen und Praxiserfahrungen in Beziehung zu setzen. Die Auswertung von Unterrichtsreflexionen und eine entsprechende R{\"u}ckmeldung stellt Forschende und Dozierende allerdings vor praktische wie theoretische Herausforderungen. Im Kontext der Forschung zu K{\"u}nstlicher Intelligenz (KI) entwickelte Methoden bieten hier neue Potenziale. 
Der Beitrag stellt {\"u}berblicksartig zwei Teilstudien vor, die mit Hilfe von KI-Methoden wie dem maschinellen Lernen untersuchen, inwieweit eine Auswertung von Unterrichtsreflexionen angehender Physiklehrkr{\"a}fte auf Basis eines theoretisch abgeleiteten Reflexionsmodells und die automatisierte R{\"u}ckmeldung hierzu m{\"o}glich sind. Dabei wurden unterschiedliche Ans{\"a}tze des maschinellen Lernens verwendet, um modellbasierte Klassifikation und Exploration von Themen in Unterrichtsreflexionen umzusetzen. Die Genauigkeit der Ergebnisse wurde vor allem durch sog. Große Sprachmodelle gesteigert, die auch den Transfer auf andere Standorte und F{\"a}cher erm{\"o}glichen. F{\"u}r die fachdidaktische Forschung bedeuten sie jedoch wiederum neue Herausforderungen, wie etwa systematische Verzerrungen und Intransparenz von Entscheidungen. Dennoch empfehlen wir, die Potenziale der KI-basierten Methoden gr{\"u}ndlicher zu erforschen und konsequent in der Praxis (etwa in Form von Webanwendungen) zu implementieren.}, language = {de} }