@phdthesis{Abedjan2014, author = {Abedjan, Ziawasch}, title = {Improving RDF data with data mining}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71334}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {Linked Open Data (LOD) comprises very many and often large public data sets and knowledge bases. Those datasets are mostly presented in the RDF triple structure of subject, predicate, and object, where each triple represents a statement or fact. Unfortunately, the heterogeneity of available open data requires significant integration steps before it can be used in applications. Meta information, such as ontological definitions and exact range definitions of predicates, are desirable and ideally provided by an ontology. However in the context of LOD, ontologies are often incomplete or simply not available. Thus, it is useful to automatically generate meta information, such as ontological dependencies, range definitions, and topical classifications. Association rule mining, which was originally applied for sales analysis on transactional databases, is a promising and novel technique to explore such data. We designed an adaptation of this technique for min-ing Rdf data and introduce the concept of "mining configurations", which allows us to mine RDF data sets in various ways. Different configurations enable us to identify schema and value dependencies that in combination result in interesting use cases. To this end, we present rule-based approaches for auto-completion, data enrichment, ontology improvement, and query relaxation. Auto-completion remedies the problem of inconsistent ontology usage, providing an editing user with a sorted list of commonly used predicates. A combination of different configurations step extends this approach to create completely new facts for a knowledge base. We present two approaches for fact generation, a user-based approach where a user selects the entity to be amended with new facts and a data-driven approach where an algorithm discovers entities that have to be amended with missing facts. As knowledge bases constantly grow and evolve, another approach to improve the usage of RDF data is to improve existing ontologies. Here, we present an association rule based approach to reconcile ontology and data. Interlacing different mining configurations, we infer an algorithm to discover synonymously used predicates. Those predicates can be used to expand query results and to support users during query formulation. We provide a wide range of experiments on real world datasets for each use case. The experiments and evaluations show the added value of association rule mining for the integration and usability of RDF data and confirm the appropriateness of our mining configuration methodology.}, language = {en} } @inproceedings{OPUS4-7665, title = {Proceedings of the Second HPI Cloud Symposium "Operating the Cloud" 2014}, number = {94}, editor = {Bosse, Sascha and Elsaid, Mohamed Esam and Feinbube, Frank and M{\"u}ller, Hendrik}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-319-0}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-76654}, pages = {vii, 59}, year = {2015}, abstract = {Every year, the Hasso Plattner Institute (HPI) invites guests from industry and academia to a collaborative scientific workshop on the topic "Operating the Cloud". Our goal is to provide a forum for the exchange of knowledge and experience between industry and academia. Hence, HPI's Future SOC Lab is the adequate environment to host this event which is also supported by BITKOM. On the occasion of this workshop we called for submissions of research papers and practitioners' reports. "Operating the Cloud" aims to be a platform for productive discussions of innovative ideas, visions, and upcoming technologies in the field of cloud operation and administration. In this workshop proceedings the results of the second HPI cloud symposium "Operating the Cloud" 2014 are published. We thank the authors for exciting presentations and insights into their current work and research. Moreover, we look forward to more interesting submissions for the upcoming symposium in 2015.}, language = {en} } @book{MeyerWeske2014, author = {Meyer, Andreas and Weske, Mathias}, title = {Weak conformance between process models and synchronized object life cycles}, number = {91}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-303-9}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71722}, publisher = {Universit{\"a}t Potsdam}, pages = {31}, year = {2014}, abstract = {Process models specify behavioral execution constraints between activities as well as between activities and data objects. A data object is characterized by its states and state transitions represented as object life cycle. For process execution, all behavioral execution constraints must be correct. Correctness can be verified via soundness checking which currently only considers control flow information. For data correctness, conformance between a process model and its object life cycles is checked. Current approaches abstract from dependencies between multiple data objects and require fully specified process models although, in real-world process repositories, often underspecified models are found. Coping with these issues, we introduce the concept of synchronized object life cycles and we define a mapping of data constraints of a process model to Petri nets extending an existing mapping. Further, we apply the notion of weak conformance to process models to tell whether each time an activity needs to access a data object in a particular state, it is guaranteed that the data object is in or can reach the expected state. Then, we introduce an algorithm for an integrated verification of control flow correctness and weak data conformance using soundness checking.}, language = {en} } @book{FelgentreffBorningHirschfeld2013, author = {Felgentreff, Tim and Borning, Alan and Hirschfeld, Robert}, title = {Babelsberg : specifying and solving constraints on object behavior}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-265-0}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-67296}, publisher = {Universit{\"a}t Potsdam}, pages = {53}, year = {2013}, abstract = {Constraints allow developers to specify desired properties of systems in a number of domains, and have those properties be maintained automatically. This results in compact, declarative code, avoiding scattered code to check and imperatively re-satisfy invariants. Despite these advantages, constraint programming is not yet widespread, with standard imperative programming still the norm. There is a long history of research on integrating constraint programming with the imperative paradigm. However, this integration typically does not unify the constructs for encapsulation and abstraction from both paradigms. This impedes re-use of modules, as client code written in one paradigm can only use modules written to support that paradigm. Modules require redundant definitions if they are to be used in both paradigms. We present a language - Babelsberg - that unifies the constructs for en- capsulation and abstraction by using only object-oriented method definitions for both declarative and imperative code. Our prototype - Babelsberg/R - is an extension to Ruby, and continues to support Ruby's object-oriented se- mantics. It allows programmers to add constraints to existing Ruby programs in incremental steps by placing them on the results of normal object-oriented message sends. It is implemented by modifying a state-of-the-art Ruby virtual machine. The performance of standard object-oriented code without con- straints is only modestly impacted, with typically less than 10\% overhead compared with the unmodified virtual machine. Furthermore, our architec- ture for adding multiple constraint solvers allows Babelsberg to deal with constraints in a variety of domains. We argue that our approach provides a useful step toward making con- straint solving a generic tool for object-oriented programmers. We also provide example applications, written in our Ruby-based implementation, which use constraints in a variety of application domains, including interactive graphics, circuit simulations, data streaming with both hard and soft constraints on performance, and configuration file Management.}, language = {en} } @book{GieseBecker2013, author = {Giese, Holger and Becker, Basil}, title = {Modeling and verifying dynamic evolving service-oriented architectures}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-246-9}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-65112}, publisher = {Universit{\"a}t Potsdam}, pages = {97}, year = {2013}, abstract = {The service-oriented architecture supports the dynamic assembly and runtime reconfiguration of complex open IT landscapes by means of runtime binding of service contracts, launching of new components and termination of outdated ones. Furthermore, the evolution of these IT landscapes is not restricted to exchanging components with other ones using the same service contracts, as new services contracts can be added as well. However, current approaches for modeling and verification of service-oriented architectures do not support these important capabilities to their full extend.In this report we present an extension of the current OMG proposal for service modeling with UML - SoaML - which overcomes these limitations. It permits modeling services and their service contracts at different levels of abstraction, provides a formal semantics for all modeling concepts, and enables verifying critical properties. Our compositional and incremental verification approach allows for complex properties including communication parameters and time and covers besides the dynamic binding of service contracts and the replacement of components also the evolution of the systems by means of new service contracts. The modeling as well as verification capabilities of the presented approach are demonstrated by means of a supply chain example and the verification results of a first prototype are shown.}, language = {en} } @book{RoggeSoltiMansvanderAalstetal.2013, author = {Rogge-Solti, Andreas and Mans, Ronny S. and van der Aalst, Wil M. P. and Weske, Mathias}, title = {Repairing event logs using stochastic process models}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-258-2}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-66797}, publisher = {Universit{\"a}t Potsdam}, pages = {19}, year = {2013}, abstract = {Companies strive to improve their business processes in order to remain competitive. Process mining aims to infer meaningful insights from process-related data and attracted the attention of practitioners, tool-vendors, and researchers in recent years. Traditionally, event logs are assumed to describe the as-is situation. But this is not necessarily the case in environments where logging may be compromised due to manual logging. For example, hospital staff may need to manually enter information regarding the patient's treatment. As a result, events or timestamps may be missing or incorrect. In this paper, we make use of process knowledge captured in process models, and provide a method to repair missing events in the logs. This way, we facilitate analysis of incomplete logs. We realize the repair by combining stochastic Petri nets, alignments, and Bayesian networks. We evaluate the results using both synthetic data and real event data from a Dutch hospital.}, language = {en} } @book{OPUS4-6813, title = {Cloud security mechanisms}, number = {87}, editor = {Neuhaus, Christian and Polze, Andreas}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-281-0}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-68168}, publisher = {Universit{\"a}t Potsdam}, pages = {78}, year = {2014}, abstract = {Cloud computing has brought great benefits in cost and flexibility for provisioning services. The greatest challenge of cloud computing remains however the question of security. The current standard tools in access control mechanisms and cryptography can only partly solve the security challenges of cloud infrastructures. In the recent years of research in security and cryptography, novel mechanisms, protocols and algorithms have emerged that offer new ways to create secure services atop cloud infrastructures. This report provides introductions to a selection of security mechanisms that were part of the "Cloud Security Mechanisms" seminar in summer term 2013 at HPI.}, language = {en} } @phdthesis{Kunze2013, author = {Kunze, Matthias}, title = {Searching business process models by example}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-68844}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Business processes are fundamental to the operations of a company. Each product manufactured and every service provided is the result of a series of actions that constitute a business process. Business process management is an organizational principle that makes the processes of a company explicit and offers capabilities to implement procedures, control their execution, analyze their performance, and improve them. Therefore, business processes are documented as process models that capture these actions and their execution ordering, and make them accessible to stakeholders. As these models are an essential knowledge asset, they need to be managed effectively. In particular, the discovery and reuse of existing knowledge becomes challenging in the light of companies maintaining hundreds and thousands of process models. In practice, searching process models has been solved only superficially by means of free-text search of process names and their descriptions. Scientific contributions are limited in their scope, as they either present measures for process similarity or elaborate on query languages to search for particular aspects. However, they fall short in addressing efficient search, the presentation of search results, and the support to reuse discovered models. This thesis presents a novel search method, where a query is expressed by an exemplary business process model that describes the behavior of a possible answer. This method builds upon a formal framework that captures and compares the behavior of process models by the execution ordering of actions. The framework contributes a conceptual notion of behavioral distance that quantifies commonalities and differences of a pair of process models, and enables process model search. Based on behavioral distances, a set of measures is proposed that evaluate the quality of a particular search result to guide the user in assessing the returned matches. A projection of behavioral aspects to a process model enables highlighting relevant fragments that led to a match and facilitates its reuse. The thesis further elaborates on two search techniques that provide concrete behavioral distance functions as an instantiation of the formal framework. Querying enables search with a notion of behavioral inclusion with regard to the query. In contrast, similarity search obtains process models that are similar to a query, even if the query is not precisely matched. For both techniques, indexes are presented that enable efficient search. Methods to evaluate the quality and performance of process model search are introduced and applied to the techniques of this thesis. They show good results with regard to human assessment and scalability in a practical setting.}, language = {en} } @book{PapeTrefferHirschfeldetal.2013, author = {Pape, Tobias and Treffer, Arian and Hirschfeld, Robert and Haupt, Michael}, title = {Extending a Java Virtual Machine to Dynamic Object-oriented Languages}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-266-7}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-67438}, publisher = {Universit{\"a}t Potsdam}, pages = {163}, year = {2013}, abstract = {There are two common approaches to implement a virtual machine (VM) for a dynamic object-oriented language. On the one hand, it can be implemented in a C-like language for best performance and maximum control over the resulting executable. On the other hand, it can be implemented in a language such as Java that allows for higher-level abstractions. These abstractions, such as proper object-oriented modularization, automatic memory management, or interfaces, are missing in C-like languages but they can simplify the implementation of prevalent but complex concepts in VMs, such as garbage collectors (GCs) or just-in-time compilers (JITs). Yet, the implementation of a dynamic object-oriented language in Java eventually results in two VMs on top of each other (double stack), which impedes performance. For statically typed languages, the Maxine VM solves this problem; it is written in Java but can be executed without a Java virtual machine (JVM). However, it is currently not possible to execute dynamic object-oriented languages in Maxine. This work presents an approach to bringing object models and execution models of dynamic object-oriented languages to the Maxine VM and the application of this approach to Squeak/Smalltalk. The representation of objects in and the execution of dynamic object-oriented languages pose certain challenges to the Maxine VM that lacks certain variation points necessary to enable an effortless and straightforward implementation of dynamic object-oriented languages' execution models. The implementation of Squeak/Smalltalk in Maxine as a feasibility study is to unveil such missing variation points.}, language = {en} } @phdthesis{Glander2012, author = {Glander, Tassilo}, title = {Multi-scale representations of virtual 3D city models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-64117}, school = {Universit{\"a}t Potsdam}, year = {2012}, abstract = {Virtual 3D city and landscape models are the main subject investigated in this thesis. They digitally represent urban space and have many applications in different domains, e.g., simulation, cadastral management, and city planning. Visualization is an elementary component of these applications. Photo-realistic visualization with an increasingly high degree of detail leads to fundamental problems for comprehensible visualization. A large number of highly detailed and textured objects within a virtual 3D city model may create visual noise and overload the users with information. Objects are subject to perspective foreshortening and may be occluded or not displayed in a meaningful way, as they are too small. In this thesis we present abstraction techniques that automatically process virtual 3D city and landscape models to derive abstracted representations. These have a reduced degree of detail, while essential characteristics are preserved. After introducing definitions for model, scale, and multi-scale representations, we discuss the fundamentals of map generalization as well as techniques for 3D generalization. The first presented technique is a cell-based generalization of virtual 3D city models. It creates abstract representations that have a highly reduced level of detail while maintaining essential structures, e.g., the infrastructure network, landmark buildings, and free spaces. The technique automatically partitions the input virtual 3D city model into cells based on the infrastructure network. The single building models contained in each cell are aggregated to abstracted cell blocks. Using weighted infrastructure elements, cell blocks can be computed on different hierarchical levels, storing the hierarchy relation between the cell blocks. Furthermore, we identify initial landmark buildings within a cell by comparing the properties of individual buildings with the aggregated properties of the cell. For each block, the identified landmark building models are subtracted using Boolean operations and integrated in a photo-realistic way. Finally, for the interactive 3D visualization we discuss the creation of the virtual 3D geometry and their appearance styling through colors, labeling, and transparency. We demonstrate the technique with example data sets. Additionally, we discuss applications of generalization lenses and transitions between abstract representations. The second technique is a real-time-rendering technique for geometric enhancement of landmark objects within a virtual 3D city model. Depending on the virtual camera distance, landmark objects are scaled to ensure their visibility within a specific distance interval while deforming their environment. First, in a preprocessing step a landmark hierarchy is computed, this is then used to derive distance intervals for the interactive rendering. At runtime, using the virtual camera distance, a scaling factor is computed and applied to each landmark. The scaling factor is interpolated smoothly at the interval boundaries using cubic B{\´e}zier splines. Non-landmark geometry that is near landmark objects is deformed with respect to a limited number of landmarks. We demonstrate the technique by applying it to a highly detailed virtual 3D city model and a generalized 3D city model. In addition we discuss an adaptation of the technique for non-linear projections and mobile devices. The third technique is a real-time rendering technique to create abstract 3D isocontour visualization of virtual 3D terrain models. The virtual 3D terrain model is visualized as a layered or stepped relief. The technique works without preprocessing and, as it is implemented using programmable graphics hardware, can be integrated with minimal changes into common terrain rendering techniques. Consequently, the computation is done in the rendering pipeline for each vertex, primitive, i.e., triangle, and fragment. For each vertex, the height is quantized to the nearest isovalue. For each triangle, the vertex configuration with respect to their isovalues is determined first. Using the configuration, the triangle is then subdivided. The subdivision forms a partial step geometry aligned with the triangle. For each fragment, the surface appearance is determined, e.g., depending on the surface texture, shading, and height-color-mapping. Flexible usage of the technique is demonstrated with applications from focus+context visualization, out-of-core terrain rendering, and information visualization. This thesis presents components for the creation of abstract representations of virtual 3D city and landscape models. Re-using visual language from cartography, the techniques enable users to build on their experience with maps when interpreting these representations. Simultaneously, characteristics of 3D geovirtual environments are taken into account by addressing and discussing, e.g., continuous scale, interaction, and perspective.}, language = {en} } @phdthesis{Seibel2012, author = {Seibel, Andreas}, title = {Traceability and model management with executable and dynamic hierarchical megamodels}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-64222}, school = {Universit{\"a}t Potsdam}, year = {2012}, abstract = {Nowadays, model-driven engineering (MDE) promises to ease software development by decreasing the inherent complexity of classical software development. In order to deliver on this promise, MDE increases the level of abstraction and automation, through a consideration of domain-specific models (DSMs) and model operations (e.g. model transformations or code generations). DSMs conform to domain-specific modeling languages (DSMLs), which increase the level of abstraction, and model operations are first-class entities of software development because they increase the level of automation. Nevertheless, MDE has to deal with at least two new dimensions of complexity, which are basically caused by the increased linguistic and technological heterogeneity. The first dimension of complexity is setting up an MDE environment, an activity comprised of the implementation or selection of DSMLs and model operations. Setting up an MDE environment is both time-consuming and error-prone because of the implementation or adaptation of model operations. The second dimension of complexity is concerned with applying MDE for actual software development. Applying MDE is challenging because a collection of DSMs, which conform to potentially heterogeneous DSMLs, are required to completely specify a complex software system. A single DSML can only be used to describe a specific aspect of a software system at a certain level of abstraction and from a certain perspective. Additionally, DSMs are usually not independent but instead have inherent interdependencies, reflecting (partial) similar aspects of a software system at different levels of abstraction or from different perspectives. A subset of these dependencies are applications of various model operations, which are necessary to keep the degree of automation high. This becomes even worse when addressing the first dimension of complexity. Due to continuous changes, all kinds of dependencies, including the applications of model operations, must also be managed continuously. This comprises maintaining the existence of these dependencies and the appropriate (re-)application of model operations. The contribution of this thesis is an approach that combines traceability and model management to address the aforementioned challenges of configuring and applying MDE for software development. The approach is considered as a traceability approach because it supports capturing and automatically maintaining dependencies between DSMs. The approach is considered as a model management approach because it supports managing the automated (re-)application of heterogeneous model operations. In addition, the approach is considered as a comprehensive model management. Since the decomposition of model operations is encouraged to alleviate the first dimension of complexity, the subsequent composition of model operations is required to counteract their fragmentation. A significant portion of this thesis concerns itself with providing a method for the specification of decoupled yet still highly cohesive complex compositions of heterogeneous model operations. The approach supports two different kinds of compositions - data-flow compositions and context compositions. Data-flow composition is used to define a network of heterogeneous model operations coupled by sharing input and output DSMs alone. Context composition is related to a concept used in declarative model transformation approaches to compose individual model transformation rules (units) at any level of detail. In this thesis, context composition provides the ability to use a collection of dependencies as context for the composition of other dependencies, including model operations. In addition, the actual implementation of model operations, which are going to be composed, do not need to implement any composition concerns. The approach is realized by means of a formalism called an executable and dynamic hierarchical megamodel, based on the original idea of megamodels. This formalism supports specifying compositions of dependencies (traceability and model operations). On top of this formalism, traceability is realized by means of a localization concept, and model management by means of an execution concept.}, language = {en} } @book{SchwalbKruegerPlattner2013, author = {Schwalb, David and Kr{\"u}ger, Jens and Plattner, Hasso}, title = {Cache conscious column organization in in-memory column stores}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-228-5}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-63890}, publisher = {Universit{\"a}t Potsdam}, pages = {v, 84}, year = {2013}, abstract = {Cost models are an essential part of database systems, as they are the basis of query performance optimization. Based on predictions made by cost models, the fastest query execution plan can be chosen and executed or algorithms can be tuned and optimised. In-memory databases shifts the focus from disk to main memory accesses and CPU costs, compared to disk based systems where input and output costs dominate the overall costs and other processing costs are often neglected. However, modelling memory accesses is fundamentally different and common models do not apply anymore. This work presents a detailed parameter evaluation for the plan operators scan with equality selection, scan with range selection, positional lookup and insert in in-memory column stores. Based on this evaluation, a cost model based on cache misses for estimating the runtime of the considered plan operators using different data structures is developed. Considered are uncompressed columns, bit compressed and dictionary encoded columns with sorted and unsorted dictionaries. Furthermore, tree indices on the columns and dictionaries are discussed. Finally, partitioned columns consisting of one partition with a sorted and one with an unsorted dictionary are investigated. New values are inserted in the unsorted dictionary partition and moved periodically by a merge process to the sorted partition. An efficient attribute merge algorithm is described, supporting the update performance required to run enterprise applications on read-optimised databases. Further, a memory traffic based cost model for the merge process is provided.}, language = {en} } @book{VogelGiese2013, author = {Vogel, Thomas and Giese, Holger}, title = {Model-driven engineering of adaptation engines for self-adaptive software : executable runtime megamodels}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-227-8}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-63825}, publisher = {Universit{\"a}t Potsdam}, pages = {vi, 59}, year = {2013}, abstract = {The development of self-adaptive software requires the engineering of an adaptation engine that controls and adapts the underlying adaptable software by means of feedback loops. The adaptation engine often describes the adaptation by using runtime models representing relevant aspects of the adaptable software and particular activities such as analysis and planning that operate on these runtime models. To systematically address the interplay between runtime models and adaptation activities in adaptation engines, runtime megamodels have been proposed for self-adaptive software. A runtime megamodel is a specific runtime model whose elements are runtime models and adaptation activities. Thus, a megamodel captures the interplay between multiple models and between models and activities as well as the activation of the activities. In this article, we go one step further and present a modeling language for ExecUtable RuntimE MegAmodels (EUREMA) that considerably eases the development of adaptation engines by following a model-driven engineering approach. We provide a domain-specific modeling language and a runtime interpreter for adaptation engines, in particular for feedback loops. Megamodels are kept explicit and alive at runtime and by interpreting them, they are directly executed to run feedback loops. Additionally, they can be dynamically adjusted to adapt feedback loops. Thus, EUREMA supports development by making feedback loops, their runtime models, and adaptation activities explicit at a higher level of abstraction. Moreover, it enables complex solutions where multiple feedback loops interact or even operate on top of each other. Finally, it leverages the co-existence of self-adaptation and off-line adaptation for evolution.}, language = {en} } @book{BeyhlBlouinGieseetal.2016, author = {Beyhl, Thomas and Blouin, Dominique and Giese, Holger and Lambers, Leen}, title = {On the operationalization of graph queries with generalized discrimination networks}, number = {106}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-372-5}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-96279}, publisher = {Universit{\"a}t Potsdam}, pages = {33}, year = {2016}, abstract = {Graph queries have lately gained increased interest due to application areas such as social networks, biological networks, or model queries. For the relational database case the relational algebra and generalized discrimination networks have been studied to find appropriate decompositions into subqueries and ordering of these subqueries for query evaluation or incremental updates of query results. For graph database queries however there is no formal underpinning yet that allows us to find such suitable operationalizations. Consequently, we suggest a simple operational concept for the decomposition of arbitrary complex queries into simpler subqueries and the ordering of these subqueries in form of generalized discrimination networks for graph queries inspired by the relational case. The approach employs graph transformation rules for the nodes of the network and thus we can employ the underlying theory. We further show that the proposed generalized discrimination networks have the same expressive power as nested graph conditions.}, language = {en} } @unpublished{GrapentinHeidlerKorschetal.2014, author = {Grapentin, Andreas and Heidler, Kirstin and Korsch, Dimitri and Kumar Sah, Rakesh and Kunzmann, Nicco and Henning, Johannes and Mattis, Toni and Rein, Patrick and Seckler, Eric and Groneberg, Bj{\"o}rn and Zimmermann, Florian}, title = {Embedded operating system projects}, number = {90}, editor = {Hentschel, Uwe and Richter, Daniel and Polze, Andreas}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-296-4}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-69154}, pages = {xi, 87}, year = {2014}, abstract = {In today's life, embedded systems are ubiquitous. But they differ from traditional desktop systems in many aspects - these include predictable timing behavior (real-time), the management of scarce resources (memory, network), reliable communication protocols, energy management, special purpose user-interfaces (headless operation), system configuration, programming languages (to support software/hardware co-design), and modeling techniques. Within this technical report, authors present results from the lecture "Operating Systems for Embedded Computing" that has been offered by the "Operating Systems and Middleware" group at HPI in Winter term 2013/14. Focus of the lecture and accompanying projects was on principles of real-time computing. Students had the chance to gather practical experience with a number of different OSes and applications and present experiences with near-hardware programming. Projects address the entire spectrum, from bare-metal programming to harnessing a real-time OS to exercising the full software/hardware co-design cycle. Three outstanding projects are at the heart of this technical report. Project 1 focuses on the development of a bare-metal operating system for LEGO Mindstorms EV3. While still a toy, it comes with a powerful ARM processor, 64 MB of main memory, standard interfaces, such as Bluetooth and network protocol stacks. EV3 runs a version of 1 1 Introduction Linux. Sources are available from Lego's web site. However, many devices and their driver software are proprietary and not well documented. Developing a new, bare-metal OS for the EV3 requires an understanding of the EV3 boot process. Since no standard input/output devices are available, initial debugging steps are tedious. After managing these initial steps, the project was able to adapt device drivers for a few Lego devices to an extent that a demonstrator (the Segway application) could be successfully run on the new OS. Project 2 looks at the EV3 from a different angle. The EV3 is running a pretty decent version of Linux- in principle, the RT_PREEMPT patch can turn any Linux system into a real-time OS by modifying the behavior of a number of synchronization constructs at the heart of the OS. Priority inversion is a problem that is solved by protocols such as priority inheritance or priority ceiling. Real-time OSes implement at least one of the protocols. The central idea of the project was the comparison of non-real-time and real-time variants of Linux on the EV3 hardware. A task set that showed effects of priority inversion on standard EV3 Linux would operate flawlessly on the Linux version with the RT_PREEMPT-patch applied. If only patching Lego's version of Linux was that easy... Project 3 takes the notion of real-time computing more seriously. The application scenario was centered around our Carrera Digital 132 racetrack. Obtaining position information from the track, controlling individual cars, detecting and modifying the Carrera Digital protocol required design and implementation of custom controller hardware. What to implement in hardware, firmware, and what to implement in application software - this was the central question addressed by the project.}, language = {en} } @phdthesis{Polyvyanyy2012, author = {Polyvyanyy, Artem}, title = {Structuring process models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-59024}, school = {Universit{\"a}t Potsdam}, year = {2012}, abstract = {One can fairly adopt the ideas of Donald E. Knuth to conclude that process modeling is both a science and an art. Process modeling does have an aesthetic sense. Similar to composing an opera or writing a novel, process modeling is carried out by humans who undergo creative practices when engineering a process model. Therefore, the very same process can be modeled in a myriad number of ways. Once modeled, processes can be analyzed by employing scientific methods. Usually, process models are formalized as directed graphs, with nodes representing tasks and decisions, and directed arcs describing temporal constraints between the nodes. Common process definition languages, such as Business Process Model and Notation (BPMN) and Event-driven Process Chain (EPC) allow process analysts to define models with arbitrary complex topologies. The absence of structural constraints supports creativity and productivity, as there is no need to force ideas into a limited amount of available structural patterns. Nevertheless, it is often preferable that models follow certain structural rules. A well-known structural property of process models is (well-)structuredness. A process model is (well-)structured if and only if every node with multiple outgoing arcs (a split) has a corresponding node with multiple incoming arcs (a join), and vice versa, such that the set of nodes between the split and the join induces a single-entry-single-exit (SESE) region; otherwise the process model is unstructured. The motivations for well-structured process models are manifold: (i) Well-structured process models are easier to layout for visual representation as their formalizations are planar graphs. (ii) Well-structured process models are easier to comprehend by humans. (iii) Well-structured process models tend to have fewer errors than unstructured ones and it is less probable to introduce new errors when modifying a well-structured process model. (iv) Well-structured process models are better suited for analysis with many existing formal techniques applicable only for well-structured process models. (v) Well-structured process models are better suited for efficient execution and optimization, e.g., when discovering independent regions of a process model that can be executed concurrently. Consequently, there are process modeling languages that encourage well-structured modeling, e.g., Business Process Execution Language (BPEL) and ADEPT. However, the well-structured process modeling implies some limitations: (i) There exist processes that cannot be formalized as well-structured process models. (ii) There exist processes that when formalized as well-structured process models require a considerable duplication of modeling constructs. Rather than expecting well-structured modeling from start, we advocate for the absence of structural constraints when modeling. Afterwards, automated methods can suggest, upon request and whenever possible, alternative formalizations that are "better" structured, preferably well-structured. In this thesis, we study the problem of automatically transforming process models into equivalent well-structured models. The developed transformations are performed under a strong notion of behavioral equivalence which preserves concurrency. The findings are implemented in a tool, which is publicly available.}, language = {en} } @article{KaitouaRablMarkl2020, author = {Kaitoua, Abdulrahman and Rabl, Tilmann and Markl, Volker}, title = {A distributed data exchange engine for polystores}, series = {Information technology : methods and applications of informatics and information technology}, volume = {62}, journal = {Information technology : methods and applications of informatics and information technology}, number = {3-4}, publisher = {De Gruyter}, address = {Berlin}, issn = {1611-2776}, doi = {10.1515/itit-2019-0037}, pages = {145 -- 156}, year = {2020}, abstract = {There is an increasing interest in fusing data from heterogeneous sources. Combining data sources increases the utility of existing datasets, generating new information and creating services of higher quality. A central issue in working with heterogeneous sources is data migration: In order to share and process data in different engines, resource intensive and complex movements and transformations between computing engines, services, and stores are necessary. Muses is a distributed, high-performance data migration engine that is able to interconnect distributed data stores by forwarding, transforming, repartitioning, or broadcasting data among distributed engines' instances in a resource-, cost-, and performance-adaptive manner. As such, it performs seamless information sharing across all participating resources in a standard, modular manner. We show an overall improvement of 30 \% for pipelining jobs across multiple engines, even when we count the overhead of Muses in the execution time. This performance gain implies that Muses can be used to optimise large pipelines that leverage multiple engines.}, language = {en} } @article{DreselerBoissierRabletal.2020, author = {Dreseler, Markus and Boissier, Martin and Rabl, Tilmann and Uflacker, Matthias}, title = {Quantifying TPC-H choke points and their optimizations}, series = {Proceedings of the VLDB Endowment}, volume = {13}, journal = {Proceedings of the VLDB Endowment}, number = {8}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3389133.3389138}, pages = {1206 -- 1220}, year = {2020}, abstract = {TPC-H continues to be the most widely used benchmark for relational OLAP systems. It poses a number of challenges, also known as "choke points", which database systems have to solve in order to achieve good benchmark results. Examples include joins across multiple tables, correlated subqueries, and correlations within the TPC-H data set. Knowing the impact of such optimizations helps in developing optimizers as well as in interpreting TPC-H results across database systems. This paper provides a systematic analysis of choke points and their optimizations. It complements previous work on TPC-H choke points by providing a quantitative discussion of their relevance. It focuses on eleven choke points where the optimizations are beneficial independently of the database system. Of these, the flattening of subqueries and the placement of predicates have the biggest impact. Three queries (Q2, Q17, and Q21) are strongly ifluenced by the choice of an efficient query plan; three others (Q1, Q13, and Q18) are less influenced by plan optimizations and more dependent on an efficient execution engine.}, language = {en} } @article{BorchertMockTomczaketal.2021, author = {Borchert, Florian and Mock, Andreas and Tomczak, Aurelie and H{\"u}gel, Jonas and Alkarkoukly, Samer and Knurr, Alexander and Volckmar, Anna-Lena and Stenzinger, Albrecht and Schirmacher, Peter and Debus, J{\"u}rgen and J{\"a}ger, Dirk and Longerich, Thomas and Fr{\"o}hling, Stefan and Eils, Roland and Bougatf, Nina and Sax, Ulrich and Schapranow, Matthieu-Patrick}, title = {Correction to: Knowledge bases and software support for variant interpretation in precision oncology}, series = {Briefings in bioinformatics}, volume = {22}, journal = {Briefings in bioinformatics}, number = {6}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1467-5463}, doi = {10.1093/bib/bbab246}, pages = {1}, year = {2021}, language = {en} } @article{RoostapourNeumannNeumannetal.2022, author = {Roostapour, Vahid and Neumann, Aneta and Neumann, Frank and Friedrich, Tobias}, title = {Pareto optimization for subset selection with dynamic cost constraints}, series = {Artificial intelligence}, volume = {302}, journal = {Artificial intelligence}, publisher = {Elsevier}, address = {Amsterdam}, issn = {0004-3702}, doi = {10.1016/j.artint.2021.103597}, pages = {17}, year = {2022}, abstract = {We consider the subset selection problem for function f with constraint bound B that changes over time. Within the area of submodular optimization, various greedy approaches are commonly used. For dynamic environments we observe that the adaptive variants of these greedy approaches are not able to maintain their approximation quality. Investigating the recently introduced POMC Pareto optimization approach, we show that this algorithm efficiently computes a phi=(alpha(f)/2)(1 - 1/e(alpha)f)-approximation, where alpha(f) is the submodularity ratio of f, for each possible constraint bound b <= B. Furthermore, we show that POMC is able to adapt its set of solutions quickly in the case that B increases. Our experimental investigations for the influence maximization in social networks show the advantage of POMC over generalized greedy algorithms. We also consider EAMC, a new evolutionary algorithm with polynomial expected time guarantee to maintain phi approximation ratio, and NSGA-II with two different population sizes as advanced multi-objective optimization algorithm, to demonstrate their challenges in optimizing the maximum coverage problem. Our empirical analysis shows that, within the same number of evaluations, POMC is able to perform as good as NSGA-II under linear constraint, while EAMC performs significantly worse than all considered algorithms in most cases.}, language = {en} } @book{OPUS4-10026, title = {Proceedings of the 10th Ph.D. Retreat of the HPI Research School on Service-oriented Systems Engineering}, number = {111}, editor = {Meinel, Christoph and Plattner, Hasso and D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and Giese, Holger and Baudisch, Patrick and Friedrich, Tobias and M{\"u}ller, Emmanuel}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-390-9}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-100260}, publisher = {Universit{\"a}t Potsdam}, pages = {vi, 255}, year = {2016}, abstract = {Design and Implementation of service-oriented architectures imposes a huge number of research questions from the fields of software engineering, system analysis and modeling, adaptability, and application integration. Component orientation and web services are two approaches for design and realization of complex web-based system. Both approaches allow for dynamic application adaptation as well as integration of enterprise application. Commonly used technologies, such as J2EE and .NET, form de facto standards for the realization of complex distributed systems. Evolution of component systems has lead to web services and service-based architectures. This has been manifested in a multitude of industry standards and initiatives such as XML, WSDL UDDI, SOAP, etc. All these achievements lead to a new and promising paradigm in IT systems engineering which proposes to design complex software solutions as collaboration of contractually defined software services. Service-Oriented Systems Engineering represents a symbiosis of best practices in object-orientation, component-based development, distributed computing, and business process management. It provides integration of business and IT concerns. The annual Ph.D. Retreat of the Research School provides each member the opportunity to present his/her current state of their research and to give an outline of a prospective Ph.D. thesis. Due to the interdisciplinary structure of the research school, this technical report covers a wide range of topics. These include but are not limited to: Human Computer Interaction and Computer Vision as Service; Service-oriented Geovisualization Systems; Algorithm Engineering for Service-oriented Systems; Modeling and Verification of Self-adaptive Service-oriented Systems; Tools and Methods for Software Engineering in Service-oriented Systems; Security Engineering of Service-based IT Systems; Service-oriented Information Systems; Evolutionary Transition of Enterprise Applications to Service Orientation; Operating System Abstractions for Service-oriented Computing; and Services Specification, Composition, and Enactment.}, language = {en} } @phdthesis{Takouna2014, author = {Takouna, Ibrahim}, title = {Energy-efficient and performance-aware virtual machine management for cloud data centers}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-72399}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {Virtualisierte Cloud Datenzentren stellen nach Bedarf Ressourcen zur Verf{\"u}gu-ng, erm{\"o}glichen agile Ressourcenbereitstellung und beherbergen heterogene Applikationen mit verschiedenen Anforderungen an Ressourcen. Solche Datenzentren verbrauchen enorme Mengen an Energie, was die Erh{\"o}hung der Betriebskosten, der W{\"a}rme innerhalb der Zentren und des Kohlendioxidausstoßes verursacht. Der Anstieg des Energieverbrauches kann durch ein ineffektives Ressourcenmanagement, das die ineffiziente Ressourcenausnutzung verursacht, entstehen. Die vorliegende Dissertation stellt detaillierte Modelle und neue Verfahren f{\"u}r virtualisiertes Ressourcenmanagement in Cloud Datenzentren vor. Die vorgestellten Verfahren ziehen das Service-Level-Agreement (SLA) und die Heterogenit{\"a}t der Auslastung bez{\"u}glich des Bedarfs an Speicherzugriffen und Kommunikationsmustern von Web- und HPC- (High Performance Computing) Applikationen in Betracht. Um die pr{\"a}sentierten Techniken zu evaluieren, verwenden wir Simulationen und echte Protokollierung der Auslastungen von Web- und HPC- Applikationen. Außerdem vergleichen wir unser Techniken und Verfahren mit anderen aktuellen Verfahren durch die Anwendung von verschiedenen Performance Metriken. Die Hauptbeitr{\"a}ge dieser Dissertation sind Folgendes: Ein Proaktives auf robuster Optimierung basierendes Ressourcenbereitstellungsverfahren. Dieses Verfahren erh{\"o}ht die F{\"a}higkeit der Hostes zur Verf{\"u}g-ungsstellung von mehr VMs. Gleichzeitig aber wird der unn{\"o}tige Energieverbrauch minimiert. Zus{\"a}tzlich mindert diese Technik unerw{\"u}nschte {\"A}nde-rungen im Energiezustand des Servers. Die vorgestellte Technik nutzt einen auf Intervall basierenden Vorhersagealgorithmus zur Implementierung einer robusten Optimierung. Dabei werden unsichere Anforderungen in Betracht gezogen. Ein adaptives und auf Intervall basierendes Verfahren zur Vorhersage des Arbeitsaufkommens mit hohen, in k{\"u}rzer Zeit auftretenden Schwankungen. Die Intervall basierende Vorhersage ist implementiert in der Standard Abweichung Variante und in der Median absoluter Abweichung Variante. Die Intervall-{\"A}nderungen basieren auf einem adaptiven Vertrauensfenster um die Schwankungen des Arbeitsaufkommens zu bew{\"a}ltigen. Eine robuste VM Zusammenlegung f{\"u}r ein effizientes Energie und Performance Management. Dies erm{\"o}glicht die gegenseitige Abh{\"a}ngigkeit zwischen der Energie und der Performance zu minimieren. Unser Verfahren reduziert die Anzahl der VM-Migrationen im Vergleich mit den neu vor kurzem vorgestellten Verfahren. Dies tr{\"a}gt auch zur Reduzierung des durch das Netzwerk verursachten Energieverbrauches. Außerdem reduziert dieses Verfahren SLA-Verletzungen und die Anzahl von {\"A}nderungen an Energiezus-t{\"a}nden. Ein generisches Modell f{\"u}r das Netzwerk eines Datenzentrums um die verz{\"o}-gerte Kommunikation und ihre Auswirkung auf die VM Performance und auf die Netzwerkenergie zu simulieren. Außerdem wird ein generisches Modell f{\"u}r ein Memory-Bus des Servers vorgestellt. Dieses Modell beinhaltet auch Modelle f{\"u}r die Latenzzeit und den Energieverbrauch f{\"u}r verschiedene Memory Frequenzen. Dies erlaubt eine Simulation der Memory Verz{\"o}gerung und ihre Auswirkung auf die VM-Performance und auf den Memory Energieverbrauch. Kommunikation bewusste und Energie effiziente Zusammenlegung f{\"u}r parallele Applikationen um die dynamische Entdeckung von Kommunikationsmustern und das Umplanen von VMs zu erm{\"o}glichen. Das Umplanen von VMs benutzt eine auf den entdeckten Kommunikationsmustern basierende Migration. Eine neue Technik zur Entdeckung von dynamischen Mustern ist implementiert. Sie basiert auf der Signal Verarbeitung des Netzwerks von VMs, anstatt die Informationen des virtuellen Umstellung der Hosts oder der Initiierung der VMs zu nutzen. Das Ergebnis zeigt, dass unsere Methode die durchschnittliche Anwendung des Netzwerks reduziert und aufgrund der Reduzierung der aktiven Umstellungen Energie gespart. Außerdem bietet sie eine bessere VM Performance im Vergleich zu der CPU-basierten Platzierung. Memory bewusste VM Zusammenlegung f{\"u}r unabh{\"a}ngige VMs. Sie nutzt die Vielfalt des VMs Memory Zuganges um die Anwendung vom Memory-Bus der Hosts zu balancieren. Die vorgestellte Technik, Memory-Bus Load Balancing (MLB), verteilt die VMs reaktiv neu im Bezug auf ihre Anwendung vom Memory-Bus. Sie nutzt die VM Migration um die Performance des gesamtem Systems zu verbessern. Außerdem sind die dynamische Spannung, die Frequenz Skalierung des Memory und die MLB Methode kombiniert um ein besseres Energiesparen zu leisten.}, language = {en} } @phdthesis{Steinmetz2013, author = {Steinmetz, Nadine}, title = {Context-aware semantic analysis of video metadata}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-70551}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Im Vergleich zu einer stichwortbasierten Suche erm{\"o}glicht die semantische Suche ein pr{\"a}ziseres und anspruchsvolleres Durchsuchen von (Web)-Dokumenten, weil durch die explizite Semantik Mehrdeutigkeiten von nat{\"u}rlicher Sprache vermieden und semantische Beziehungen in das Suchergebnis einbezogen werden k{\"o}nnen. Eine semantische, Entit{\"a}ten-basierte Suche geht von einer Anfrage mit festgelegter Bedeutung aus und liefert nur Dokumente, die mit dieser Entit{\"a}t annotiert sind als Suchergebnis. Die wichtigste Voraussetzung f{\"u}r eine Entit{\"a}ten-zentrierte Suche stellt die Annotation der Dokumente im Archiv mit Entit{\"a}ten und Kategorien dar. Textuelle Informationen werden analysiert und mit den entsprechenden Entit{\"a}ten und Kategorien versehen, um den Inhalt semantisch erschließen zu k{\"o}nnen. Eine manuelle Annotation erfordert Dom{\"a}nenwissen und ist sehr zeitaufwendig. Die semantische Annotation von Videodokumenten erfordert besondere Aufmerksamkeit, da inhaltsbasierte Metadaten von Videos aus verschiedenen Quellen stammen, verschiedene Eigenschaften und Zuverl{\"a}ssigkeiten besitzen und daher nicht wie Fließtext behandelt werden k{\"o}nnen. Die vorliegende Arbeit stellt einen semantischen Analyseprozess f{\"u}r Video-Metadaten vor. Die Eigenschaften der verschiedenen Metadatentypen werden analysiert und ein Konfidenzwert ermittelt. Dieser Wert spiegelt die Korrektheit und die wahrscheinliche Mehrdeutigkeit eines Metadatums wieder. Beginnend mit dem Metadatum mit dem h{\"o}chsten Konfidenzwert wird der Analyseprozess innerhalb eines Kontexts in absteigender Reihenfolge des Konfidenzwerts durchgef{\"u}hrt. Die bereits analysierten Metadaten dienen als Referenzpunkt f{\"u}r die weiteren Analysen. So kann eine m{\"o}glichst korrekte Analyse der heterogen strukturierten Daten eines Kontexts sichergestellt werden. Am Ende der Analyse eines Metadatums wird die f{\"u}r den Kontext relevanteste Entit{\"a}t aus einer Liste von Kandidaten identifiziert - das Metadatum wird disambiguiert. Hierf{\"u}r wurden verschiedene Disambiguierungsalgorithmen entwickelt, die Beschreibungstexte und semantische Beziehungen der Entit{\"a}tenkandidaten zum gegebenen Kontext in Betracht ziehen. Der Kontext f{\"u}r die Disambiguierung wird f{\"u}r jedes Metadatum anhand der Eigenschaften und Konfidenzwerte zusammengestellt. Der vorgestellte Analyseprozess ist an zwei Hypothesen angelehnt: Um die Analyseergebnisse verbessern zu k{\"o}nnen, sollten die Metadaten eines Kontexts in absteigender Reihenfolge ihres Konfidenzwertes verarbeitet werden und die Kontextgrenzen von Videometadaten sollten durch Segmentgrenzen definiert werden, um m{\"o}glichst Kontexte mit koh{\"a}rentem Inhalt zu erhalten. Durch ausf{\"u}hrliche Evaluationen konnten die gestellten Hypothesen best{\"a}tigt werden. Der Analyseprozess wurden gegen mehrere State-of-the-Art Methoden verglichen und erzielt verbesserte Ergebnisse in Bezug auf Recall und Precision, besonders f{\"u}r Metadaten, die aus weniger zuverl{\"a}ssigen Quellen stammen. Der Analyseprozess ist Teil eines Videoanalyse-Frameworks und wurde bereits erfolgreich in verschiedenen Projekten eingesetzt.}, language = {en} } @phdthesis{Wang2011, author = {Wang, Long}, title = {X-tracking the usage interest on web sites}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-51077}, school = {Universit{\"a}t Potsdam}, year = {2011}, abstract = {The exponential expanding of the numbers of web sites and Internet users makes WWW the most important global information resource. From information publishing and electronic commerce to entertainment and social networking, the Web allows an inexpensive and efficient access to the services provided by individuals and institutions. The basic units for distributing these services are the web sites scattered throughout the world. However, the extreme fragility of web services and content, the high competence between similar services supplied by different sites, and the wide geographic distributions of the web users drive the urgent requirement from the web managers to track and understand the usage interest of their web customers. This thesis, "X-tracking the Usage Interest on Web Sites", aims to fulfill this requirement. "X" stands two meanings: one is that the usage interest differs from various web sites, and the other is that usage interest is depicted from multi aspects: internal and external, structural and conceptual, objective and subjective. "Tracking" shows that our concentration is on locating and measuring the differences and changes among usage patterns. This thesis presents the methodologies on discovering usage interest on three kinds of web sites: the public information portal site, e-learning site that provides kinds of streaming lectures and social site that supplies the public discussions on IT issues. On different sites, we concentrate on different issues related with mining usage interest. The educational information portal sites were the first implementation scenarios on discovering usage patterns and optimizing the organization of web services. In such cases, the usage patterns are modeled as frequent page sets, navigation paths, navigation structures or graphs. However, a necessary requirement is to rebuild the individual behaviors from usage history. We give a systematic study on how to rebuild individual behaviors. Besides, this thesis shows a new strategy on building content clusters based on pair browsing retrieved from usage logs. The difference between such clusters and the original web structure displays the distance between the destinations from usage side and the expectations from design side. Moreover, we study the problem on tracking the changes of usage patterns in their life cycles. The changes are described from internal side integrating conceptual and structure features, and from external side for the physical features; and described from local side measuring the difference between two time spans, and global side showing the change tendency along the life cycle. A platform, Web-Cares, is developed to discover the usage interest, to measure the difference between usage interest and site expectation and to track the changes of usage patterns. E-learning site provides the teaching materials such as slides, recorded lecture videos and exercise sheets. We focus on discovering the learning interest on streaming lectures, such as real medias, mp4 and flash clips. Compared to the information portal site, the usage on streaming lectures encapsulates the variables such as viewing time and actions during learning processes. The learning interest is discovered in the form of answering 6 questions, which covers finding the relations between pieces of lectures and the preference among different forms of lectures. We prefer on detecting the changes of learning interest on the same course from different semesters. The differences on the content and structure between two courses leverage the changes on the learning interest. We give an algorithm on measuring the difference on learning interest integrated with similarity comparison between courses. A search engine, TASK-Moniminer, is created to help the teacher query the learning interest on their streaming lectures on tele-TASK site. Social site acts as an online community attracting web users to discuss the common topics and share their interesting information. Compared to the public information portal site and e-learning web site, the rich interactions among users and web content bring the wider range of content quality, on the other hand, provide more possibilities to express and model usage interest. We propose a framework on finding and recommending high reputation articles in a social site. We observed that the reputation is classified into global and local categories; the quality of the articles having high reputation is related with the content features. Based on these observations, our framework is implemented firstly by finding the articles having global or local reputation, and secondly clustering articles based on their content relations, and then the articles are selected and recommended from each cluster based on their reputation ranks.}, language = {en} } @book{MeinelWillems2013, author = {Meinel, Christoph and Willems, Christian}, title = {openHPI : the MOOC offer at Hasso Plattner Institute}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-264-3}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-67176}, publisher = {Universit{\"a}t Potsdam}, pages = {21}, year = {2013}, abstract = {The new interactive online educational platform openHPI, (https://openHPI.de) from Hasso Plattner Institute (HPI), offers freely accessible courses at no charge for all who are interested in subjects in the field of information technology and computer science. Since 2011, "Massive Open Online Courses," called MOOCs for short, have been offered, first at Stanford University and then later at other U.S. elite universities. Following suit, openHPI provides instructional videos on the Internet and further reading material, combined with learning-supportive self-tests, homework and a social discussion forum. Education is further stimulated by the support of a virtual learning community. In contrast to "traditional" lecture platforms, such as the tele-TASK portal (http://www.tele-task.de) where multimedia recorded lectures are available on demand, openHPI offers didactic online courses. The courses have a fixed start date and offer a balanced schedule of six consecutive weeks presented in multimedia and, whenever possible, interactive learning material. Each week, one chapter of the course subject is treated. In addition, a series of learning videos, texts, self-tests and homework exercises are provided to course participants at the beginning of the week. The course offering is combined with a social discussion platform where participants have the opportunity to enter into an exchange with course instructors and fellow participants. Here, for example, they can get answers to questions and discuss the topics in depth. The participants naturally decide themselves about the type and range of their learning activities. They can make personal contributions to the course, for example, in blog posts or tweets, which they can refer to in the forum. In turn, other participants have the chance to comment on, discuss or expand on what has been said. In this way, the learners become the teachers and the subject matter offered to a virtual community is linked to a social learning network.}, language = {en} } @article{PfitznerSteckhanArnrich2021, author = {Pfitzner, Bjarne and Steckhan, Nico and Arnrich, Bert}, title = {Federated learning in a medical context}, series = {ACM transactions on internet technology : TOIT / Association for Computing}, volume = {21}, journal = {ACM transactions on internet technology : TOIT / Association for Computing}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1533-5399}, doi = {10.1145/3412357}, pages = {1 -- 31}, year = {2021}, abstract = {Data privacy is a very important issue. Especially in fields like medicine, it is paramount to abide by the existing privacy regulations to preserve patients' anonymity. However, data is required for research and training machine learning models that could help gain insight into complex correlations or personalised treatments that may otherwise stay undiscovered. Those models generally scale with the amount of data available, but the current situation often prohibits building large databases across sites. So it would be beneficial to be able to combine similar or related data from different sites all over the world while still preserving data privacy. Federated learning has been proposed as a solution for this, because it relies on the sharing of machine learning models, instead of the raw data itself. That means private data never leaves the site or device it was collected on. Federated learning is an emerging research area, and many domains have been identified for the application of those methods. This systematic literature review provides an extensive look at the concept of and research into federated learning and its applicability for confidential healthcare datasets.}, language = {en} } @article{GoebelLagodzinskiSeidel2021, author = {G{\"o}bel, Andreas and Lagodzinski, Julius Albert Gregor and Seidel, Karen}, title = {Counting homomorphisms to trees modulo a prime}, series = {ACM transactions on computation theory : TOCT / Association for Computing Machinery}, volume = {13}, journal = {ACM transactions on computation theory : TOCT / Association for Computing Machinery}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1942-3454}, doi = {10.1145/3460958}, pages = {1 -- 33}, year = {2021}, abstract = {Many important graph-theoretic notions can be encoded as counting graph homomorphism problems, such as partition functions in statistical physics, in particular independent sets and colourings. In this article, we study the complexity of \#(p) HOMSTOH, the problem of counting graph homomorphisms from an input graph to a graph H modulo a prime number p. Dyer and Greenhill proved a dichotomy stating that the tractability of non-modular counting graph homomorphisms depends on the structure of the target graph. Many intractable cases in non-modular counting become tractable in modular counting due to the common phenomenon of cancellation. In subsequent studies on counting modulo 2, however, the influence of the structure of H on the tractability was shown to persist, which yields similar dichotomies.
Our main result states that for every tree H and every prime p the problem \#pHOMSTOH is either polynomial time computable or \#P-p-complete. This relates to the conjecture of Faben and Jerrum stating that this dichotomy holds for every graph H when counting modulo 2. In contrast to previous results on modular counting, the tractable cases of \#pHOMSTOH are essentially the same for all values of the modulo when H is a tree. To prove this result, we study the structural properties of a homomorphism. As an important interim result, our study yields a dichotomy for the problem of counting weighted independent sets in a bipartite graph modulo some prime p. These results are the first suggesting that such dichotomies hold not only for the modulo 2 case but also for the modular counting functions of all primes p.}, language = {en} } @article{TalebRohrerBergneretal.2022, author = {Taleb, Aiham and Rohrer, Csaba and Bergner, Benjamin and De Leon, Guilherme and Rodrigues, Jonas Almeida and Schwendicke, Falk and Lippert, Christoph and Krois, Joachim}, title = {Self-supervised learning methods for label-efficient dental caries classification}, series = {Diagnostics : open access journal}, volume = {12}, journal = {Diagnostics : open access journal}, number = {5}, publisher = {MDPI}, address = {Basel}, issn = {2075-4418}, doi = {10.3390/diagnostics12051237}, pages = {15}, year = {2022}, abstract = {High annotation costs are a substantial bottleneck in applying deep learning architectures to clinically relevant use cases, substantiating the need for algorithms to learn from unlabeled data. In this work, we propose employing self-supervised methods. To that end, we trained with three self-supervised algorithms on a large corpus of unlabeled dental images, which contained 38K bitewing radiographs (BWRs). We then applied the learned neural network representations on tooth-level dental caries classification, for which we utilized labels extracted from electronic health records (EHRs). Finally, a holdout test-set was established, which consisted of 343 BWRs and was annotated by three dental professionals and approved by a senior dentist. This test-set was used to evaluate the fine-tuned caries classification models. Our experimental results demonstrate the obtained gains by pretraining models using self-supervised algorithms. These include improved caries classification performance (6 p.p. increase in sensitivity) and, most importantly, improved label-efficiency. In other words, the resulting models can be fine-tuned using few labels (annotations). Our results show that using as few as 18 annotations can produce >= 45\% sensitivity, which is comparable to human-level diagnostic performance. This study shows that self-supervision can provide gains in medical image analysis, particularly when obtaining labels is costly and expensive.}, language = {en} } @article{DeFreitasJohnsonGoldenetal.2021, author = {De Freitas, Jessica K. and Johnson, Kipp W. and Golden, Eddye and Nadkarni, Girish N. and Dudley, Joel T. and B{\"o}ttinger, Erwin and Glicksberg, Benjamin S. and Miotto, Riccardo}, title = {Phe2vec}, series = {Patterns}, volume = {2}, journal = {Patterns}, number = {9}, publisher = {Elsevier}, address = {Amsterdam}, issn = {2666-3899}, doi = {10.1016/j.patter.2021.100337}, pages = {9}, year = {2021}, abstract = {Robust phenotyping of patients from electronic health records (EHRs) at scale is a challenge in clinical informatics. Here, we introduce Phe2vec, an automated framework for disease phenotyping from EHRs based on unsupervised learning and assess its effectiveness against standard rule-based algorithms from Phenotype KnowledgeBase (PheKB). Phe2vec is based on pre-computing embeddings of medical concepts and patients' clinical history. Disease phenotypes are then derived from a seed concept and its neighbors in the embedding space. Patients are linked to a disease if their embedded representation is close to the disease phenotype. Comparing Phe2vec and PheKB cohorts head-to-head using chart review, Phe2vec performed on par or better in nine out of ten diseases. Differently from other approaches, it can scale to any condition and was validated against widely adopted expert-based standards. Phe2vec aims to optimize clinical informatics research by augmenting current frameworks to characterize patients by condition and derive reliable disease cohorts.}, language = {en} } @article{BorchertMockTomczaketal.2021, author = {Borchert, Florian and Mock, Andreas and Tomczak, Aurelie and H{\"u}gel, Jonas and Alkarkoukly, Samer and Knurr, Alexander and Volckmar, Anna-Lena and Stenzinger, Albrecht and Schirmacher, Peter and Debus, J{\"u}rgen and J{\"a}ger, Dirk and Longerich, Thomas and Fr{\"o}hling, Stefan and Eils, Roland and Bougatf, Nina and Sax, Ulrich and Schapranow, Matthieu-Patrick}, title = {Knowledge bases and software support for variant interpretation in precision oncology}, series = {Briefings in bioinformatics}, volume = {22}, journal = {Briefings in bioinformatics}, number = {6}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1467-5463}, doi = {10.1093/bib/bbab134}, pages = {17}, year = {2021}, abstract = {Precision oncology is a rapidly evolving interdisciplinary medical specialty. Comprehensive cancer panels are becoming increasingly available at pathology departments worldwide, creating the urgent need for scalable cancer variant annotation and molecularly informed treatment recommendations. A wealth of mainly academia-driven knowledge bases calls for software tools supporting the multi-step diagnostic process. We derive a comprehensive list of knowledge bases relevant for variant interpretation by a review of existing literature followed by a survey among medical experts from university hospitals in Germany. In addition, we review cancer variant interpretation tools, which integrate multiple knowledge bases. We categorize the knowledge bases along the diagnostic process in precision oncology and analyze programmatic access options as well as the integration of knowledge bases into software tools. The most commonly used knowledge bases provide good programmatic access options and have been integrated into a range of software tools. For the wider set of knowledge bases, access options vary across different parts of the diagnostic process. Programmatic access is limited for information regarding clinical classifications of variants and for therapy recommendations. The main issue for databases used for biological classification of pathogenic variants and pathway context information is the lack of standardized interfaces. There is no single cancer variant interpretation tool that integrates all identified knowledge bases. Specialized tools are available and need to be further developed for different steps in the diagnostic process.}, language = {en} }