@article{KossmannHalfpapJankriftetal.2020, author = {Kossmann, Jan and Halfpap, Stefan and Jankrift, Marcel and Schlosser, Rainer}, title = {Magic mirror in my hand, which is the best in the land?}, series = {Proceedings of the VLDB Endowment}, volume = {13}, journal = {Proceedings of the VLDB Endowment}, number = {11}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3407790.3407832}, pages = {2382 -- 2395}, year = {2020}, abstract = {Indexes are essential for the efficient processing of database workloads. Proposed solutions for the relevant and challenging index selection problem range from metadata-based simple heuristics, over sophisticated multi-step algorithms, to approaches that yield optimal results. The main challenges are (i) to accurately determine the effect of an index on the workload cost while considering the interaction of indexes and (ii) a large number of possible combinations resulting from workloads containing many queries and massive schemata with possibly thousands of attributes.
In this work, we describe and analyze eight index selection algorithms that are based on different concepts and compare them along different dimensions, such as solution quality, runtime, multi-column support, solution granularity, and complexity. In particular, we analyze the solutions of the algorithms for the challenging analytical Join Order, TPC-H, and TPC-DS benchmarks. Afterward, we assess strengths and weaknesses, infer insights for index selection in general and each approach individually, before we give recommendations on when to use which approach.}, language = {en} } @article{KoumarelasJiangNaumann2020, author = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix}, title = {Data preparation for duplicate detection}, series = {Journal of data and information quality : (JDIQ)}, volume = {12}, journal = {Journal of data and information quality : (JDIQ)}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3377878}, pages = {24}, year = {2020}, abstract = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints to domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection up to 19\% in AUC-PR.}, language = {en} } @article{KossmannSchlosser2020, author = {Kossmann, Jan and Schlosser, Rainer}, title = {Self-driving database systems}, series = {Distributed and parallel databases}, volume = {38}, journal = {Distributed and parallel databases}, number = {4}, publisher = {Springer}, address = {Dordrecht}, issn = {0926-8782}, doi = {10.1007/s10619-020-07288-w}, pages = {795 -- 817}, year = {2020}, abstract = {Challenges for self-driving database systems, which tune their physical design and configuration autonomously, are manifold: Such systems have to anticipate future workloads, find robust configurations efficiently, and incorporate knowledge gained by previous actions into later decisions. We present a component-based framework for self-driving database systems that enables database integration and development of self-managing functionality with low overhead by relying on separation of concerns. By keeping the components of the framework reusable and exchangeable, experiments are simplified, which promotes further research in that area. Moreover, to optimize multiple mutually dependent features, e.g., index selection and compression configurations, we propose a linear programming (LP) based algorithm to derive an efficient tuning order automatically. Afterwards, we demonstrate the applicability and scalability of our approach with reproducible examples.}, language = {en} } @article{HartungBorghardt2020, author = {Hartung, Niklas and Borghardt, Jens Markus}, title = {A mechanistic framework for a priori pharmacokinetic predictions of orally inhaled drugs}, series = {PLoS Computational Biology : a new community journal}, volume = {16}, journal = {PLoS Computational Biology : a new community journal}, number = {12}, publisher = {PLoS}, address = {San Fransisco}, issn = {1553-734X}, doi = {10.1371/journal.pcbi.1008466}, pages = {24}, year = {2020}, abstract = {Author summary
The use of orally inhaled drugs for treating lung diseases is appealing since they have the potential for lung selectivity, i.e. high exposure at the site of action -the lung- without excessive side effects. However, the degree of lung selectivity depends on a large number of factors, including physiochemical properties of drug molecules, patient disease state, and inhalation devices. To predict the impact of these factors on drug exposure and thereby to understand the characteristics of an optimal drug for inhalation, we develop a predictive mathematical framework (a "pharmacokinetic model"). In contrast to previous approaches, our model allows combining knowledge from different sources appropriately and its predictions were able to adequately predict different sets of clinical data. Finally, we compare the impact of different factors and find that the most important factors are the size of the inhaled particles, the affinity of the drug to the lung tissue, as well as the rate of drug dissolution in the lung. In contrast to the common belief, the solubility of a drug in the lining fluids is not found to be relevant. These findings are important to understand how inhaled drugs should be designed to achieve best treatment results in patients.
The fate of orally inhaled drugs is determined by pulmonary pharmacokinetic processes such as particle deposition, pulmonary drug dissolution, and mucociliary clearance. Even though each single process has been systematically investigated, a quantitative understanding on the interaction of processes remains limited and therefore identifying optimal drug and formulation characteristics for orally inhaled drugs is still challenging. To investigate this complex interplay, the pulmonary processes can be integrated into mathematical models. However, existing modeling attempts considerably simplify these processes or are not systematically evaluated against (clinical) data. In this work, we developed a mathematical framework based on physiologically-structured population equations to integrate all relevant pulmonary processes mechanistically. A tailored numerical resolution strategy was chosen and the mechanistic model was evaluated systematically against data from different clinical studies. Without adapting the mechanistic model or estimating kinetic parameters based on individual study data, the developed model was able to predict simultaneously (i) lung retention profiles of inhaled insoluble particles, (ii) particle size-dependent pharmacokinetics of inhaled monodisperse particles, (iii) pharmacokinetic differences between inhaled fluticasone propionate and budesonide, as well as (iv) pharmacokinetic differences between healthy volunteers and asthmatic patients. Finally, to identify the most impactful optimization criteria for orally inhaled drugs, the developed mechanistic model was applied to investigate the impact of input parameters on both the pulmonary and systemic exposure. Interestingly, the solubility of the inhaled drug did not have any relevant impact on the local and systemic pharmacokinetics. Instead, the pulmonary dissolution rate, the particle size, the tissue affinity, and the systemic clearance were the most impactful potential optimization parameters. In the future, the developed prediction framework should be considered a powerful tool for identifying optimal drug and formulation characteristics.}, language = {en} } @article{BludauBrueggemannBuschetal.2020, author = {Bludau, Mark-Jan and Br{\"u}ggemann, Viktoria and Busch, Anke and D{\"o}rk, Marian}, title = {Reading traces}, series = {Computer graphics forum : journal of the European Association for Computer Graphics}, volume = {39}, journal = {Computer graphics forum : journal of the European Association for Computer Graphics}, number = {3}, publisher = {Wiley}, address = {Hoboken}, issn = {0167-7055}, doi = {10.1111/cgf.13964}, pages = {77 -- 87}, year = {2020}, abstract = {Through a design study, we develop an approach to data exploration that utilizes elastic visualizations designed to support varying degrees of detail and abstraction. Examining the notions of scalability and elasticity in interactive visualizations, we introduce a visualization of personal reading traces such as marginalia or markings inside the reference library of German realist author Theodor Fontane. To explore such a rich and extensive collection, meaningful visual forms of abstraction and detail are as important as the transitions between those states. Following a growing research interest in the role of fluid interactivity and animations between views, we are particularly interested in the potential of carefully designed transitions and consistent representations across scales. The resulting prototype addresses humanistic research questions about the interplay of distant and close reading with visualization research on continuous navigation along several granularity levels, using scrolling as one of the main interaction mechanisms. In addition to presenting the design process and resulting prototype, we present findings from a qualitative evaluation of the tool, which suggest that bridging between distant and close views can enhance exploration, but that transitions between views need to be crafted very carefully to facilitate comprehension.}, language = {en} } @phdthesis{Taeumel2020, author = {Taeumel, Marcel}, title = {Data-driven tool construction in exploratory programming environments}, doi = {10.25932/publishup-44428}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-444289}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 299}, year = {2020}, abstract = {This work presents a new design for programming environments that promote the exploration of domain-specific software artifacts and the construction of graphical tools for such program comprehension tasks. In complex software projects, tool building is essential because domain- or task-specific tools can support decision making by representing concerns concisely with low cognitive effort. In contrast, generic tools can only support anticipated scenarios, which usually align with programming language concepts or well-known project domains. However, the creation and modification of interactive tools is expensive because the glue that connects data to graphics is hard to find, change, and test. Even if valuable data is available in a common format and even if promising visualizations could be populated, programmers have to invest many resources to make changes in the programming environment. Consequently, only ideas of predictably high value will be implemented. In the non-graphical, command-line world, the situation looks different and inspiring: programmers can easily build their own tools as shell scripts by configuring and combining filter programs to process data. We propose a new perspective on graphical tools and provide a concept to build and modify such tools with a focus on high quality, low effort, and continuous adaptability. That is, (1) we propose an object-oriented, data-driven, declarative scripting language that reduces the amount of and governs the effects of glue code for view-model specifications, and (2) we propose a scalable UI-design language that promotes short feedback loops in an interactive, graphical environment such as Morphic known from Self or Squeak/Smalltalk systems. We implemented our concept as a tool building environment, which we call VIVIDE, on top of Squeak/Smalltalk and Morphic. We replaced existing code browsing and debugging tools to iterate within our solution more quickly. In several case studies with undergraduate and graduate students, we observed that VIVIDE can be applied to many domains such as live language development, source-code versioning, modular code browsing, and multi-language debugging. Then, we designed a controlled experiment to measure the effect on the time to build tools. Several pilot runs showed that training is crucial and, presumably, takes days or weeks, which implies a need for further research. As a result, programmers as users can directly work with tangible representations of their software artifacts in the VIVIDE environment. Tool builders can write domain-specific scripts to populate views to approach comprehension tasks from different angles. Our novel perspective on graphical tools can inspire the creation of new trade-offs in modularity for both data providers and view designers.}, language = {en} } @article{HollmannFrohmeEndrullatetal.2020, author = {Hollmann, Susanne and Frohme, Marcus and Endrullat, Christoph and Kremer, Andreas and D'Elia, Domenica and Regierer, Babette and Nechyporenko, Alina}, title = {Ten simple rules on how to write a standard operating procedure}, series = {PLOS Computational Biology}, volume = {16}, journal = {PLOS Computational Biology}, number = {9}, publisher = {PLOS}, address = {San Francisco}, pages = {10}, year = {2020}, abstract = {Research publications and data nowadays should be publicly available on the internet and, theoretically, usable for everyone to develop further research, products, or services. The long-term accessibility of research data is, therefore, fundamental in the economy of the research production process. However, the availability of data is not sufficient by itself, but also their quality must be verifiable. Measures to ensure reuse and reproducibility need to include the entire research life cycle, from the experimental design to the generation of data, quality control, statistical analysis, interpretation, and validation of the results. Hence, high-quality records, particularly for providing a string of documents for the verifiable origin of data, are essential elements that can act as a certificate for potential users (customers). These records also improve the traceability and transparency of data and processes, therefore, improving the reliability of results. Standards for data acquisition, analysis, and documentation have been fostered in the last decade driven by grassroot initiatives of researchers and organizations such as the Research Data Alliance (RDA). Nevertheless, what is still largely missing in the life science academic research are agreed procedures for complex routine research workflows. Here, well-crafted documentation like standard operating procedures (SOPs) offer clear direction and instructions specifically designed to avoid deviations as an absolute necessity for reproducibility. Therefore, this paper provides a standardized workflow that explains step by step how to write an SOP to be used as a starting point for appropriate research documentation.}, language = {en} } @article{ChauhanFriedrichRothenberger2020, author = {Chauhan, Ankit and Friedrich, Tobias and Rothenberger, Ralf}, title = {Greed is good for deterministic scale-free networks}, series = {Algorithmica : an international journal in computer science}, volume = {82}, journal = {Algorithmica : an international journal in computer science}, number = {11}, publisher = {Springer}, address = {New York}, issn = {0178-4617}, doi = {10.1007/s00453-020-00729-z}, pages = {3338 -- 3389}, year = {2020}, abstract = {Large real-world networks typically follow a power-law degree distribution. To study such networks, numerous random graph models have been proposed. However, real-world networks are not drawn at random. Therefore, Brach et al. (27th symposium on discrete algorithms (SODA), pp 1306-1325, 2016) introduced two natural deterministic conditions: (1) a power-law upper bound on the degree distribution (PLB-U) and (2) power-law neighborhoods, that is, the degree distribution of neighbors of each vertex is also upper bounded by a power law (PLB-N). They showed that many real-world networks satisfy both properties and exploit them to design faster algorithms for a number of classical graph problems. We complement their work by showing that some well-studied random graph models exhibit both of the mentioned PLB properties. PLB-U and PLB-N hold with high probability for Chung-Lu Random Graphs and Geometric Inhomogeneous Random Graphs and almost surely for Hyperbolic Random Graphs. As a consequence, all results of Brach et al. also hold with high probability or almost surely for those random graph classes. In the second part we study three classical NP-hard optimization problems on PLB networks. It is known that on general graphs with maximum degree Delta, a greedy algorithm, which chooses nodes in the order of their degree, only achieves a Omega (ln Delta)-approximation forMinimum Vertex Cover and Minimum Dominating Set, and a Omega(Delta)-approximation forMaximum Independent Set. We prove that the PLB-U property with beta>2 suffices for the greedy approach to achieve a constant-factor approximation for all three problems. We also show that these problems are APX-hard even if PLB-U, PLB-N, and an additional power-law lower bound on the degree distribution hold. Hence, a PTAS cannot be expected unless P = NP. Furthermore, we prove that all three problems are in MAX SNP if the PLB-U property holds.}, language = {en} } @article{BarkowskyGiese2020, author = {Barkowsky, Matthias and Giese, Holger}, title = {Hybrid search plan generation for generalized graph pattern matching}, series = {Journal of logical and algebraic methods in programming}, volume = {114}, journal = {Journal of logical and algebraic methods in programming}, publisher = {Elsevier}, address = {New York}, issn = {2352-2208}, doi = {10.1016/j.jlamp.2020.100563}, pages = {29}, year = {2020}, abstract = {In recent years, the increased interest in application areas such as social networks has resulted in a rising popularity of graph-based approaches for storing and processing large amounts of interconnected data. To extract useful information from the growing network structures, efficient querying techniques are required. In this paper, we propose an approach for graph pattern matching that allows a uniform handling of arbitrary constraints over the query vertices. Our technique builds on a previously introduced matching algorithm, which takes concrete host graph information into account to dynamically adapt the employed search plan during query execution. The dynamic algorithm is combined with an existing static approach for search plan generation, resulting in a hybrid technique which we further extend by a more sophisticated handling of filtering effects caused by constraint checks. We evaluate the presented concepts empirically based on an implementation for our graph pattern matching tool, the Story Diagram Interpreter, with queries and data provided by the LDBC Social Network Benchmark. Our results suggest that the hybrid technique may improve search efficiency in several cases, and rarely reduces efficiency.}, language = {en} } @article{HackerKrestelGrundmannetal.2020, author = {Hacker, Philipp and Krestel, Ralf and Grundmann, Stefan and Naumann, Felix}, title = {Explainable AI under contract and tort law}, series = {Artificial intelligence and law}, volume = {28}, journal = {Artificial intelligence and law}, number = {4}, publisher = {Springer}, address = {Dordrecht}, issn = {0924-8463}, doi = {10.1007/s10506-020-09260-6}, pages = {415 -- 439}, year = {2020}, abstract = {This paper shows that the law, in subtle ways, may set hitherto unrecognized incentives for the adoption of explainable machine learning applications. In doing so, we make two novel contributions. First, on the legal side, we show that to avoid liability, professional actors, such as doctors and managers, may soon be legally compelled to use explainable ML models. We argue that the importance of explainability reaches far beyond data protection law, and crucially influences questions of contractual and tort liability for the use of ML models. To this effect, we conduct two legal case studies, in medical and corporate merger applications of ML. As a second contribution, we discuss the (legally required) trade-off between accuracy and explainability and demonstrate the effect in a technical case study in the context of spam classification.}, language = {en} }