@article{CaruccioDeufemiaNaumannetal.2021,
author = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe},
title = {Discovering relaxed functional dependencies based on multi-attribute dominance},
series = {IEEE transactions on knowledge and data engineering},
volume = {33},
journal = {IEEE transactions on knowledge and data engineering},
number = {9},
publisher = {Institute of Electrical and Electronics Engineers},
address = {New York, NY},
issn = {1041-4347},
doi = {10.1109/TKDE.2020.2967722},
pages = {3212 -- 3228},
year = {2021},
abstract = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds. An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.},
language = {en}
}

@article{GronauSchaefer2021,
author = {Gronau, Norbert and Schaefer, Martin},
title = {Why metadata matters for the future of copyright},
series = {European Intellectual Property Review},
volume = {43},
journal = {European Intellectual Property Review},
number = {8},
publisher = {Sweet \& Maxwell},
address = {London},
issn = {0142-0461},
pages = {488 -- 494},
year = {2021},
abstract = {In the copyright industries of the 21st century, metadata is the grease required to make the engine of copyright run smoothly and powerfully for the benefit of creators, copyright industries and users alike. However, metadata is difficult to acquire and even more difficult to keep up to date as the rights in content are mostly multi-layered, fragmented, international and volatile. This article explores the idea of a neutral metadata search and enhancement tool that could constitute a buffer to safeguard the interests of the various proprietary database owners and avoid the shortcomings of centralised databases.},
language = {en}
}

@article{DattaSachsFreitasdaCruzetal.2021,
author = {Datta, Suparno and Sachs, Jan Philipp and Freitas da Cruz, Harry and Martensen, Tom and Bode, Philipp and Morassi Sasso, Ariane and Glicksberg, Benjamin S. and B{\"o}ttinger, Erwin},
title = {FIBER},
series = {JAMIA open},
volume = {4},
journal = {JAMIA open},
number = {3},
publisher = {Oxford Univ. Press},
address = {Oxford},
issn = {2574-2531},
doi = {10.1093/jamiaopen/ooab048},
pages = {10},
year = {2021},
abstract = {Objectives: The development of clinical predictive models hinges upon the availability of comprehensive clinical data. Tapping into such resources requires considerable effort from clinicians, data scientists, and engineers. Specifically, these efforts are focused on data extraction and preprocessing steps required prior to modeling, including complex database queries.
A handful of software libraries exist that can reduce this complexity by building upon data standards. However, a gap remains concerning electronic health records (EHRs) stored in star schema clinical data warehouses, an approach often adopted in practice. In this article, we introduce the FlexIBle EHR Retrieval (FIBER) tool: a Python library built on top of a star schema (i2b2) clinical data warehouse that enables flexible generation of modeling-ready cohorts as data frames. Materials and Methods: FIBER was developed on top of a large-scale star schema EHR database which contains data from 8 million patients and over 120 million encounters. To illustrate FIBER's capabilities, we present its application by building a heart surgery patient cohort with subsequent prediction of acute kidney injury (AKI) with various machine learning models. Results: Using FIBER, we were able to build the heart surgery cohort (n = 12 061), identify the patients that developed AKI (n = 1005), and automatically extract relevant features (n = 774). Finally, we trained machine learning models that achieved area under the curve values of up to 0.77 for this exemplary use case. Conclusion: FIBER is an open-source Python library developed for extracting information from star schema clinical data warehouses and reduces time-to-modeling, helping to streamline the clinical modeling process.},
language = {en}
}

@article{RanaOeztuerkMalik2021,
author = {Rana, Kamal and {\"O}zt{\"u}rk, Ugur and Malik, Nishant},
title = {Landslide geometry reveals its trigger},
series = {Geophysical research letters : GRL / American Geophysical Union},
volume = {48},
journal = {Geophysical research letters : GRL / American Geophysical Union},
number = {4},
publisher = {American Geophysical Union},
address = {Washington},
issn = {0094-8276},
doi = {10.1029/2020GL090848},
pages = {8},
year = {2021},
abstract = {Electronic databases of landslides seldom include the triggering mechanisms, rendering these inventories unusable for landslide hazard modeling. We present a method for classifying the triggering mechanisms of landslides in existing inventories, thus allowing these inventories to aid in landslide hazard modeling corresponding to the correct event chain. Our method uses various geometric characteristics of landslides as the feature space for the machine-learning classifier random forest, resulting in accurate and robust classifications of landslide triggers. We applied the method to six landslide inventories spread over the Japanese archipelago in several different test and training configurations to demonstrate the effectiveness of our approach. We achieved mean accuracy ranging from 67\% to 92\%. We also provide an illustrative example of a real-world usage scenario for our method using an additional inventory with unknown ground truth.
Furthermore, our feature importance analysis indicates that landslides having identical trigger mechanisms exhibit similar geometric properties.},
language = {en}
}

@article{KlieNikoloskiSelbig2014,
author = {Klie, Sebastian and Nikoloski, Zoran and Selbig, Joachim},
title = {Biological cluster evaluation for gene function prediction},
series = {Journal of computational biology},
volume = {21},
journal = {Journal of computational biology},
number = {6},
publisher = {Liebert},
address = {New Rochelle},
issn = {1066-5277},
doi = {10.1089/cmb.2009.0129},
pages = {428 -- 445},
year = {2014},
abstract = {Recent advances in high-throughput omics techniques render it possible to decode the function of genes by using the "guilt-by-association" principle on biologically meaningful clusters of gene expression data. However, the existing frameworks for biological evaluation of gene clusters are hindered by two bottleneck issues: (1) the choice for the number of clusters, and (2) the external measures which do not take into consideration the structure of the analyzed data and the ontology of the existing biological knowledge. Here, we address the identified bottlenecks by developing a novel framework that allows not only for biological evaluation of gene expression clusters based on existing structured knowledge, but also for prediction of putative gene functions. The proposed framework facilitates propagation of statistical significance at each of the following steps: (1) estimating the number of clusters, (2) evaluating the clusters in terms of novel external structural measures, (3) selecting an optimal clustering algorithm, and (4) predicting gene functions. The framework also includes a method for evaluation of gene clusters based on the structure of the employed ontology. Moreover, our method for obtaining a probabilistic range for the number of clusters is demonstrated to be valid on synthetic data and available gene expression profiles from Saccharomyces cerevisiae. Finally, we propose a network-based approach for gene function prediction which relies on the clustering of optimal score and the employed ontology. Our approach effectively predicts gene function on the Saccharomyces cerevisiae data set and is also employed to obtain putative gene functions for an Arabidopsis thaliana data set.},
language = {en}
}