@article{DespujolZabalaAlarioHoyosTurroRibaltaetal.2024, author = {Despujol Zabala, Ignacio and Alario Hoyos, Carlos and Turr{\'o} Ribalta, Carlos and Delgado Kloos, Carlos and Montoro Manrique, Germ{\'a}n and Busquets Mataix, Jaime}, title = {Transforming Open Edx into the next On-Campus LMS}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62512}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-625122}, pages = {5}, year = {2024}, abstract = {Open edX is an incredible platform to deliver MOOCs and SPOCs, designed to be robust and support hundreds of thousands of students at the same time. Nevertheless, it lacks a lot of the fine-grained functionality needed to handle students individually in an on-campus course. This short session will present the ongoing project undertaken by the 6 public universities of the Region of Madrid plus the Universitat Polit{\`e}cnica de Val{\`e}ncia, in the framework of a national initiative called UniDigital, funded by the Ministry of Universities of Spain within the Plan de Recuperaci{\'o}n, Transformaci{\'o}n y Resiliencia of the European Union. This project, led by three of these Spanish universities (UC3M, UPV, UAM), is investing more than half a million euros with the purpose of bringing the Open edX platform closer to the functionalities required for an LMS to support on-campus teaching. The aim of the project is to coordinate what is going to be developed with the Open edX development community, so these developments are incorporated into the core of the Open edX platform in its next releases. 
Features like a complete redesign of platform analytics to make them real-time, the creation of dashboards based on these analytics, the integration of a system for customized automatic feedback, improvement of exams and tasks and the extension of grading capabilities, improvements in the graphical interfaces for both students and teachers, the extension of the emailing capabilities, redesign of the file management system, integration of H5P content, the integration of a tool to create mind maps, the creation of a system to detect students at risk, or the integration of an advanced voice assistant and a gamification mobile app, among others, are part of the functionalities to be developed. The idea is to transform a first-class MOOC platform into the next on-campus LMS.}, language = {en} } @phdthesis{Vitagliano2024, author = {Vitagliano, Gerardo}, title = {Modeling the structure of tabular files for data preparation}, doi = {10.25932/publishup-62435}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624351}, school = {Universit{\"a}t Potsdam}, pages = {ii, 114}, year = {2024}, abstract = {To manage tabular data files and leverage their content in a given downstream task, practitioners often design and execute complex transformation pipelines to prepare them. The complexity of such pipelines stems from different factors, including the nature of the preparation tasks, often exploratory or ad-hoc to specific datasets; the large repertory of tools, algorithms, and frameworks that practitioners need to master; and the volume, variety, and velocity of the files to be prepared. Metadata plays a fundamental role in reducing this complexity: characterizing a file assists end users in the design of data preprocessing pipelines, and furthermore paves the way for suggestion, automation, and optimization of data preparation tasks. 
Previous research in the areas of data profiling, data integration, and data cleaning, has focused on extracting and characterizing metadata regarding the content of tabular data files, i.e., about the records and attributes of tables. Content metadata are useful for the latter stages of a preprocessing pipeline, e.g., error correction, duplicate detection, or value normalization, but they require a properly formed tabular input. Therefore, these metadata are not relevant for the early stages of a preparation pipeline, i.e., to correctly parse tables out of files. In this dissertation, we turn our focus to what we call the structure of a tabular data file, i.e., the set of characters within a file that do not represent data values but are required to parse and understand the content of the file. We provide three different approaches to represent file structure, an explicit representation based on context-free grammars; an implicit representation based on file-wise similarity; and a learned representation based on machine learning. In our first contribution, we use the grammar-based representation to characterize a set of over 3000 real-world csv files and identify multiple structural issues that let files deviate from the csv standard, e.g., by having inconsistent delimiters or containing multiple tables. We leverage our learnings about real-world files and propose Pollock, a benchmark to test how well systems parse csv files that have a non-standard structure, without any previous preparation. We report on our experiments on using Pollock to evaluate the performance of 16 real-world data management systems. Following, we characterize the structure of files implicitly, by defining a measure of structural similarity for file pairs. We design a novel algorithm to compute this measure, which is based on a graph representation of the files' content. 
We leverage this algorithm and propose Mondrian, a graphical system to assist users in identifying layout templates in a dataset, classes of files that have the same structure, and therefore can be prepared by applying the same preparation pipeline. Finally, we introduce MaGRiTTE, a novel architecture that uses self-supervised learning to automatically learn structural representations of files in the form of vectorial embeddings at three different levels: cell level, row level, and file level. We experiment with the application of structural embeddings for several tasks, namely dialect detection, row classification, and data preparation efforts estimation. Our experimental results show that structural metadata, either identified explicitly on parsing grammars, derived implicitly as file-wise similarity, or learned with the help of machine learning architectures, is fundamental to automate several tasks, to scale up preparation to large quantities of files, and to provide repeatable preparation pipelines.}, language = {en} } @phdthesis{Ghahremani2024, author = {Ghahremani, Sona}, title = {Incremental self-adaptation of dynamic architectures attaining optimality and scalability}, doi = {10.25932/publishup-62423}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624232}, school = {Universit{\"a}t Potsdam}, pages = {xii, 285}, year = {2024}, abstract = {The landscape of software self-adaptation is shaped in accordance with the need to cost-effectively achieve and maintain (software) quality at runtime and in the face of dynamic operation conditions. Optimization-based solutions perform an exhaustive search in the adaptation space, thus they may provide quality guarantees. However, these solutions render the attainment of optimal adaptation plans time-intensive, thereby hindering scalability. 
Conversely, deterministic rule-based solutions yield only sub-optimal adaptation decisions, as they are typically bound by design-time assumptions, yet they offer efficient processing and implementation, readability, expressivity of individual rules supporting early verification. Addressing the quality-cost trade-off requires solutions that simultaneously exhibit the scalability and cost-efficiency of rule-based policy formalism and the optimality of optimization-based policy formalism as explicit artifacts for adaptation. Utility functions, i.e., high-level specifications that capture system objectives, support the explicit treatment of quality-cost trade-off. Nevertheless, non-linearities, complex dynamic architectures, black-box models, and runtime uncertainty that makes the prior knowledge obsolete are a few of the sources of uncertainty and subjectivity that render the elicitation of utility non-trivial. This thesis proposes a twofold solution for incremental self-adaptation of dynamic architectures. First, we introduce Venus, a solution that combines in its design a rule- and an optimization-based formalism enabling optimal and scalable adaptation of dynamic architectures. Venus incorporates rule-like constructs and relies on utility theory for decision-making. Using a graph-based representation of the architecture, Venus captures rules as graph patterns that represent architectural fragments, thus enabling runtime extensibility and, in turn, support for dynamic architectures; the architecture is evaluated by assigning utility values to fragments; pattern-based definition of rules and utility enables incremental computation of changes on the utility that result from rule executions, rather than evaluating the complete architecture, which supports scalability. Second, we introduce HypeZon, a hybrid solution for runtime coordination of multiple off-the-shelf adaptation policies, which typically offer only partial satisfaction of the quality and cost requirements. 
Realized based on meta-self-aware architectures, HypeZon complements Venus by re-using existing policies at runtime for balancing the quality-cost trade-off. The twofold solution of this thesis is integrated in an adaptation engine that leverages state- and event-based principles for incremental execution, therefore, is scalable for large and dynamic software architectures with growing size and complexity. The utility elicitation challenge is resolved by defining a methodology to train utility-change prediction models. The thesis addresses the quality-cost trade-off in adaptation of dynamic software architectures via design-time combination (Venus) and runtime coordination (HypeZon) of rule- and optimization-based policy formalisms, while offering supporting mechanisms for optimal, cost-effective, scalable, and robust adaptation. The solutions are evaluated according to a methodology that is obtained based on our systematic literature review of evaluation in self-healing systems; the applicability and effectiveness of the contributions are demonstrated to go beyond the state-of-the-art in coverage of a wide spectrum of the problem space for software self-adaptation.}, language = {en} } @phdthesis{Limberger2024, author = {Limberger, Daniel}, title = {Concepts and techniques for 3D-embedded treemaps and their application to software visualization}, doi = {10.25932/publishup-63201}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-632014}, school = {Universit{\"a}t Potsdam}, pages = {xi, 118}, year = {2024}, abstract = {This thesis addresses concepts and techniques for interactive visualization of hierarchical data using treemaps. 
It explores (1) how treemaps can be embedded in 3D space to improve their information content and expressiveness, (2) how the readability of treemaps can be improved using level-of-detail and degree-of-interest techniques, and (3) how to design and implement a software framework for the real-time web-based rendering of treemaps embedded in 3D. With a particular emphasis on their application, use cases from software analytics are taken to test and evaluate the presented concepts and techniques. Concerning the first challenge, this thesis shows that a 3D attribute space offers enhanced possibilities for the visual mapping of data compared to classical 2D treemaps. In particular, embedding in 3D allows for improved implementation of visual variables (e.g., by sketchiness and color weaving), provision of new visual variables (e.g., by physically based materials and in situ templates), and integration of visual metaphors (e.g., by reference surfaces and renderings of natural phenomena) into the three-dimensional representation of treemaps. For the second challenge—the readability of an information visualization—the work shows that the generally higher visual clutter and increased cognitive load typically associated with three-dimensional information representations can be kept low in treemap-based representations of both small and large hierarchical datasets. By introducing an adaptive level-of-detail technique, we can not only declutter the visualization results, thereby reducing cognitive load and mitigating occlusion problems, but also summarize and highlight relevant data. Furthermore, this approach facilitates automatic labeling, supports the emphasis on data outliers, and allows visual variables to be adjusted via degree-of-interest measures. The third challenge is addressed by developing a real-time rendering framework with WebGL and accumulative multi-frame rendering. 
The framework removes hardware constraints and graphics API requirements, reduces interaction response times, and simplifies high-quality rendering. At the same time, the implementation effort for a web-based deployment of treemaps is kept reasonable. The presented visualization concepts and techniques are applied and evaluated for use cases in software analysis. In this domain, data about software systems, especially about the state and evolution of the source code, does not have a descriptive appearance or natural geometric mapping, making information visualization a key technology here. In particular, software source code can be visualized with treemap-based approaches because of its inherently hierarchical structure. With treemaps embedded in 3D, we can create interactive software maps that visually map software metrics, software developer activities, or information about the evolution of software systems alongside their hierarchical module structure. Discussions on remaining challenges and opportunities for future research for 3D-embedded treemaps and their applications conclude the thesis.}, language = {en} } @phdthesis{AlhosseiniAlmodarresiYasin2024, author = {Alhosseini Almodarresi Yasin, Seyed Ali}, title = {Classification, prediction and evaluation of graph neural networks on online social media platforms}, doi = {10.25932/publishup-62642}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-626421}, school = {Universit{\"a}t Potsdam}, pages = {xviii, 78}, year = {2024}, abstract = {The vast amount of data generated on social media platforms has made them a valuable source of information for businesses, governments and researchers. Social media data can provide insights into user behavior, preferences, and opinions. In this work, we address two important challenges in social media analytics. Predicting user engagement with online content has become a critical task for content creators to increase user engagement and reach larger audiences. 
Traditional user engagement prediction approaches rely solely on features derived from the user and content. However, a new class of deep learning methods based on graphs captures not only the content features but also the graph structure of social media networks. This thesis proposes a novel Graph Neural Network (GNN) approach to predict user interaction with tweets. The proposed approach combines the features of users, tweets and their engagement graphs. The tweet text features are extracted using pre-trained embeddings from language models, and a GNN layer is used to embed the user in a vector space. The GNN model then combines the features and graph structure to predict user engagement. The proposed approach achieves an accuracy value of 94.22\% in classifying user interactions, including likes, retweets, replies, and quotes. Another major challenge in social media analysis is detecting and classifying social bot accounts. Social bots are automated accounts used to manipulate public opinion by spreading misinformation or generating fake interactions. Detecting social bots is critical to prevent their negative impact on public opinion and trust in social media. In this thesis, we classify social bots on Twitter by applying Graph Neural Networks. The proposed approach uses a combination of both the features of a node and an aggregation of the features of a node's neighborhood to classify social bot accounts. Our final results indicate a 6\% improvement in the area under the curve score in the final predictions through the utilization of GNN. Overall, our work highlights the importance of social media data and the potential of new methods such as GNNs to predict user engagement and detect social bots. 
These methods have important implications for improving the quality and reliability of information on social media platforms and mitigating the negative impact of social bots on public opinion and discourse.}, language = {en} } @phdthesis{Benson2024, author = {Benson, Lawrence}, title = {Efficient state management with persistent memory}, doi = {10.25932/publishup-62563}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-625637}, school = {Universit{\"a}t Potsdam}, pages = {xiii, 124}, year = {2024}, abstract = {Efficiently managing large state is a key challenge for data management systems. Traditionally, state is split into fast but volatile state in memory for processing and persistent but slow state on secondary storage for durability. Persistent memory (PMem), as a new technology in the storage hierarchy, blurs the lines between these states by offering both byte-addressability and low latency like DRAM as well as persistence like secondary storage. These characteristics have the potential to cause a major performance shift in database systems. Driven by the potential impact that PMem has on data management systems, in this thesis we explore their use of PMem. We first evaluate the performance of real PMem hardware in the form of Intel Optane in a wide range of setups. To this end, we propose PerMA-Bench, a configurable benchmark framework that allows users to evaluate the performance of customizable database-related PMem access. Based on experimental results obtained with PerMA-Bench, we discuss findings and identify general and implementation-specific aspects that influence PMem performance and should be considered in future work to improve PMem-aware designs. We then propose Viper, a hybrid PMem-DRAM key-value store. Based on PMem-aware access patterns, we show how to leverage PMem and DRAM efficiently to design a key database component. 
Our evaluation shows that Viper outperforms existing key-value stores by 4-18x for inserts while offering full data persistence and achieving similar or better lookup performance. Next, we show which changes must be made to integrate PMem components into larger systems. By the example of stream processing engines, we highlight limitations of current designs and propose a prototype engine that overcomes these limitations. This allows our prototype to fully leverage PMem's performance for its internal state management. Finally, in light of Optane's discontinuation, we discuss how insights from PMem research can be transferred to future multi-tier memory setups by the example of Compute Express Link (CXL). Overall, we show that PMem offers high performance for state management, bridging the gap between fast but volatile DRAM and persistent but slow secondary storage. Although Optane was discontinued, new memory technologies are continuously emerging in various forms and we outline how novel designs for them can build on insights from existing PMem research.}, language = {en} } @phdthesis{Marx2024, author = {Marx, Carolin Valerie}, title = {Escalation of commitment in information systems projects: a cognitive-affective perspective}, doi = {10.25932/publishup-62696}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-626969}, school = {Universit{\"a}t Potsdam}, pages = {174}, year = {2024}, abstract = {While information systems (IS) projects are pivotal in guiding organizational strategies and sustaining competitive advantages, they frequently overrun budgets, extend beyond timelines, and experience high failure rates. This dissertation delves into the psychological micro-foundations of human behavior - specifically cognition and emotion - in relation to a prevalent issue in IS project management: the tendency to persist with failing courses of action, also called escalation of commitment (EoC). 
Through a mixed-methods research approach, this study investigates the emotional and cognitive bases of decision-making during IS project escalation and its evolution over time. The results of a psychophysiological laboratory experiment provide evidence for the predictions on the role of negative and complex situational integral emotions of Cognitive Dissonance over Coping Theory and add to a better understanding of how escalation tendencies change during sequential decision-making due to cognitive learning effects. Using psychophysiological measures, including data triangulation between electrodermal and cardiovascular activity and AI-based analysis of facial micro-expressions, this research reveals physiological markers of behavioral escalation tendencies. Complementing the experiment, a qualitative analysis using free-form narration during decision-making simulations shows that decision-makers employ varied cognitive reasoning patterns to justify escalating behaviors, suggesting a sequence of four distinct cognitive phases. By integrating both qualitative and quantitative findings, this dissertation offers a comprehensive theoretical framework of how cognition and emotion shape behavioral EoC over time. I propose that escalation is a cyclical adaptation of mental models, distinguished by shifts in cognitive reasoning patterns, temporal cognition mode variations, and interactions with situational emotions and their anticipation. The primary contribution of this dissertation lies in disentangling the emotional and cognitive mechanisms that drive IS project escalation. The findings provide the basis for developing de-escalation strategies, thereby helping to improve decision-making under uncertainty. 
Stakeholders involved in IS projects that get "off track" should be aware of the tendency to persist with failing courses of action and the importance of the underlying emotional and cognitive dynamics.}, language = {en} } @phdthesis{Halfpap2024, author = {Halfpap, Stefan}, title = {Integer linear programming-based heuristics for partially replicated database clusters and selecting indexes}, doi = {10.25932/publishup-63361}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-633615}, school = {Universit{\"a}t Potsdam}, pages = {iii, 185}, year = {2024}, abstract = {Column-oriented database systems can efficiently process transactional and analytical queries on a single node. However, increasing or peak analytical loads can quickly saturate single-node database systems. Then, a common scale-out option is using a database cluster with a single primary node for transaction processing and read-only replicas. Using (the naive) full replication, queries are distributed among nodes independently of the accessed data. This approach is relatively expensive because all nodes must store all data and apply all data modifications caused by inserts, deletes, or updates. In contrast to full replication, partial replication is a more cost-efficient implementation: Instead of duplicating all data to all replica nodes, partial replicas store only a subset of the data while being able to process a large workload share. Besides lower storage costs, partial replicas enable (i) better scaling because replicas must potentially synchronize only subsets of the data modifications and thus have more capacity for read-only queries and (ii) better elasticity because replicas have to load less data and can be set up faster. However, splitting the overall workload evenly among the replica nodes while optimizing the data allocation is a challenging assignment problem. 
The calculation of optimized data allocations in a partially replicated database cluster can be modeled using integer linear programming (ILP). ILP is a common approach for solving assignment problems, also in the context of database systems. Because ILP is not scalable, existing approaches (also for calculating partial allocations) often fall back to simple (e.g., greedy) heuristics for larger problem instances. Simple heuristics may work well but can lose optimization potential. In this thesis, we present optimal and ILP-based heuristic programming models for calculating data fragment allocations for partially replicated database clusters. Using ILP, we are flexible to extend our models to (i) consider data modifications and reallocations and (ii) increase the robustness of allocations to compensate for node failures and workload uncertainty. We evaluate our approaches for TPC-H, TPC-DS, and a real-world accounting workload and compare the results to state-of-the-art allocation approaches. Our evaluations show significant improvements for varied allocation's properties: Compared to existing approaches, we can, for example, (i) almost halve the amount of allocated data, (ii) improve the throughput in case of node failures and workload uncertainty while using even less memory, (iii) halve the costs of data modifications, and (iv) reallocate less than 90\% of data when adding a node to the cluster. Importantly, we can calculate the corresponding ILP-based heuristic solutions within a few seconds. 
Finally, we demonstrate that the ideas of our ILP-based heuristics are also applicable to the index selection problem.}, language = {en} } @phdthesis{Huegle2024, author = {Huegle, Johannes}, title = {Causal discovery in practice: Non-parametric conditional independence testing and tooling for causal discovery}, doi = {10.25932/publishup-63582}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-635820}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 156}, year = {2024}, abstract = {Knowledge about causal structures is crucial for decision support in various domains. For example, in discrete manufacturing, identifying the root causes of failures and quality deviations that interrupt the highly automated production process requires causal structural knowledge. However, in practice, root cause analysis is usually built upon individual expert knowledge about associative relationships. But, "correlation does not imply causation", and misinterpreting associations often leads to incorrect conclusions. Recent developments in methods for causal discovery from observational data have opened the opportunity for a data-driven examination. Despite its potential for data-driven decision support, omnipresent challenges impede causal discovery in real-world scenarios. In this thesis, we make a threefold contribution to improving causal discovery in practice. (1) The growing interest in causal discovery has led to a broad spectrum of methods with specific assumptions on the data and various implementations. Hence, application in practice requires careful consideration of existing methods, which becomes laborious when dealing with various parameters, assumptions, and implementations in different programming languages. Additionally, evaluation is challenging due to the lack of ground truth in practice and limited benchmark data that reflect real-world data characteristics. 
To address these issues, we present a platform-independent modular pipeline for causal discovery and a ground truth framework for synthetic data generation that provides comprehensive evaluation opportunities, e.g., to examine the accuracy of causal discovery methods in case of inappropriate assumptions. (2) Applying constraint-based methods for causal discovery requires selecting a conditional independence (CI) test, which is particularly challenging in mixed discrete-continuous data omnipresent in many real-world scenarios. In this context, inappropriate assumptions on the data or the commonly applied discretization of continuous variables reduce the accuracy of CI decisions, leading to incorrect causal structures. Therefore, we contribute a non-parametric CI test leveraging k-nearest neighbors methods and prove its statistical validity and power in mixed discrete-continuous data, as well as the asymptotic consistency when used in constraint-based causal discovery. An extensive evaluation of synthetic and real-world data shows that the proposed CI test outperforms state-of-the-art approaches in the accuracy of CI testing and causal discovery, particularly in settings with low sample sizes. (3) To show the applicability and opportunities of causal discovery in practice, we examine our contributions in real-world discrete manufacturing use cases. 
For example, we showcase how causal structural knowledge helps to understand unforeseen production downtimes or adds decision support in case of failures and quality deviations in automotive body shop assembly lines.}, language = {en} } @phdthesis{Katzmann2023, author = {Katzmann, Maximilian}, title = {About the analysis of algorithms on networks with underlying hyperbolic geometry}, doi = {10.25932/publishup-58296}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-582965}, school = {Universit{\"a}t Potsdam}, pages = {xi, 191}, year = {2023}, abstract = {Many complex systems that we encounter in the world can be formalized using networks. Consequently, they have been in the focus of computer science for decades, where algorithms are developed to understand and utilize these systems. Surprisingly, our theoretical understanding of these algorithms and their behavior in practice often diverge significantly. In fact, they tend to perform much better on real-world networks than one would expect when considering the theoretical worst-case bounds. One way of capturing this discrepancy is the average-case analysis, where the idea is to acknowledge the differences between practical and worst-case instances by focusing on networks whose properties match those of real graphs. Recent observations indicate that good representations of real-world networks are obtained by assuming that a network has an underlying hyperbolic geometry. In this thesis, we demonstrate that the connection between networks and hyperbolic space can be utilized as a powerful tool for average-case analysis. To this end, we first introduce strongly hyperbolic unit disk graphs and identify the famous hyperbolic random graph model as a special case of them. We then consider four problems where recent empirical results highlight a gap between theory and practice and use hyperbolic graph models to explain these phenomena theoretically. 
First, we develop a routing scheme, used to forward information in a network, and analyze its efficiency on strongly hyperbolic unit disk graphs. For the special case of hyperbolic random graphs, our algorithm beats existing performance lower bounds. Afterwards, we use the hyperbolic random graph model to theoretically explain empirical observations about the performance of the bidirectional breadth-first search. Finally, we develop algorithms for computing optimal and nearly optimal vertex covers (problems known to be NP-hard) and show that, on hyperbolic random graphs, they run in polynomial and quasi-linear time, respectively. Our theoretical analyses reveal interesting properties of hyperbolic random graphs and our empirical studies present evidence that these properties, as well as our algorithmic improvements translate back into practice.}, language = {en} } @phdthesis{Roumen2023, author = {Roumen, Thijs}, title = {Portable models for laser cutting}, doi = {10.25932/publishup-57814}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-578141}, school = {Universit{\"a}t Potsdam}, pages = {xx, 170}, year = {2023}, abstract = {Laser cutting is a fast and precise fabrication process. This makes laser cutting a powerful process in custom industrial production. Since the patents on the original technology started to expire, a growing community of tech-enthusiasts embraced the technology and started sharing the models they fabricate online. Surprisingly, the shared models appear to largely be one-offs (e.g., they proudly showcase what a single person can make in one afternoon). For laser cutting to become a relevant mainstream phenomenon (as opposed to the current tech enthusiasts and industry users), it is crucial to enable users to reproduce models made by more experienced modelers, and to build on the work of others instead of creating one-offs. 
We create a technological basis that allows users to build on the work of others---a progression that is currently held back by the use of exchange formats that disregard mechanical differences between machines and therefore overlook implications with respect to how well parts fit together mechanically (aka engineering fit). For the field to progress, we need a machine-independent sharing infrastructure. In this thesis, we outline three approaches that together get us closer to this: (1) 2D cutting plans that are tolerant to machine variations. Our initial take is a minimally invasive approach: replacing machine-specific elements in cutting plans with more tolerant elements using mechanical hacks like springs and wedges. The resulting models fabricate on any consumer laser cutter and in a range of materials. (2) sharing models in 3D. To allow building on the work of others, we build a 3D modeling environment for laser cutting (kyub). After users design a model, they export their 3D models to 2D cutting plans optimized for the machine and material at hand. We extend this volumetric environment with tools to edit individual plates, allowing users to leverage the efficiency of volumetric editing while having control over the most detailed elements in laser-cutting (plates). (3) converting legacy 2D cutting plans to 3D models. To handle legacy models, we build software to interactively reconstruct 3D models from 2D cutting plans. This allows users to reuse the models in more productive ways. We revisit this by automating the assembly process for a large subset of models. The above-mentioned software composes a larger system (kyub, 140,000 lines of code). This system integration enables the push towards actual use, which we demonstrate through a range of workshops where users build complex models such as fully functional guitars.
By simplifying sharing and re-use and the resulting increase in model complexity, this line of work forms a small step to enable personal fabrication to scale past the maker phenomenon, towards a mainstream phenomenon---the same way that other fields, such as print (postscript) and ultimately computing itself (portable programming languages, etc.) reached mass adoption.}, language = {en} } @phdthesis{Bano2023, author = {Bano, Dorina}, title = {Discovering data models from event logs}, doi = {10.25932/publishup-58542}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-585427}, school = {Universit{\"a}t Potsdam}, pages = {xvii, 137}, year = {2023}, abstract = {In the last two decades, process mining has developed from a niche discipline to a significant research area with considerable impact on academia and industry. Process mining enables organisations to identify the running business processes from historical execution data. The first requirement of any process mining technique is an event log, an artifact that represents concrete business process executions in the form of a sequence of events. These logs can be extracted from the organization's information systems and are used by process experts to retrieve deep insights from the organization's running processes. Considering the events pertaining to such logs, the process models can be automatically discovered and enhanced or annotated with performance-related information. Besides behavioral information, event logs contain domain specific data, albeit implicitly. However, such data are usually overlooked and, thus, not utilized to their full potential. Within the process mining area, we address in this thesis the research gap of discovering, from event logs, the contextual information that cannot be captured by applying existing process mining techniques. Within this research gap, we identify four key problems and tackle them by looking at an event log from different angles.
First, we address the problem of deriving an event log in the absence of a proper database access and domain knowledge. The second problem is related to the under-utilization of the implicit domain knowledge present in an event log that can increase the understandability of the discovered process model. Next, there is a lack of a holistic representation of the historical data manipulation at the process model level of abstraction. Last but not least, each process model presumes to be independent of other process models when discovered from an event log, thus, ignoring possible data dependencies between processes within an organization. For each of the problems mentioned above, this thesis proposes a dedicated method. The first method provides a solution to extract an event log only from the transactions performed on the database that are stored in the form of redo logs. The second method deals with discovering the underlying data model that is implicitly embedded in the event log, thus, complementing the discovered process model with important domain knowledge information. The third method captures, on the process model level, how the data affects the running process instances. Lastly, the fourth method is about the discovery of the relations between business processes (i.e., how they exchange data) from a set of event logs and explicitly representing such complex interdependencies in a business process architecture. 
All the methods introduced in this thesis are implemented as a prototype and their feasibility is proven by being applied on real-life event logs.}, language = {en} } @phdthesis{Sakizloglou2023, author = {Sakizloglou, Lucas}, title = {Evaluating temporal queries over history-aware architectural runtime models}, doi = {10.25932/publishup-60439}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-604396}, school = {Universit{\"a}t Potsdam}, pages = {v, 168}, year = {2023}, abstract = {In model-driven engineering, the adaptation of large software systems with dynamic structure is enabled by architectural runtime models. Such a model represents an abstract state of the system as a graph of interacting components. Every relevant change in the system is mirrored in the model and triggers an evaluation of model queries, which search the model for structural patterns that should be adapted. This thesis focuses on a type of runtime models where the expressiveness of the model and model queries is extended to capture past changes and their timing. These history-aware models and temporal queries enable more informed decision-making during adaptation, as they support the formulation of requirements on the evolution of the pattern that should be adapted. However, evaluating temporal queries during adaptation poses significant challenges. First, it implies the capability to specify and evaluate requirements on the structure, as well as the ordering and timing in which structural changes occur. Then, query answers have to reflect that the history-aware model represents the architecture of a system whose execution may be ongoing, and thus answers may depend on future changes. Finally, query evaluation needs to be adequately fast and memory-efficient despite the increasing size of the history---especially for models that are altered by numerous, rapid changes. 
The thesis presents a query language and a querying approach for the specification and evaluation of temporal queries. These contributions aim to cope with the challenges of evaluating temporal queries at runtime, a prerequisite for history-aware architectural monitoring and adaptation which has not been systematically treated by prior model-based solutions. The distinguishing features of our contributions are: the specification of queries based on a temporal logic which encodes structural patterns as graphs; the provision of formally precise query answers which account for timing constraints and ongoing executions; the incremental evaluation which avoids the re-computation of query answers after each change; and the option to discard history that is no longer relevant to queries. The query evaluation searches the model for occurrences of a pattern whose evolution satisfies a temporal logic formula. Therefore, besides model-driven engineering, another related research community is runtime verification. The approach differs from prior logic-based runtime verification solutions by supporting the representation and querying of structure via graphs and graph queries, respectively, which is more efficient for queries with complex patterns. We present a prototypical implementation of the approach and measure its speed and memory consumption in monitoring and adaptation scenarios from two application domains, with executions of an increasing size. We assess scalability by a comparison to the state-of-the-art from both related research communities. 
The implementation yields promising results, which pave the way for sophisticated history-aware self-adaptation solutions and indicate that the approach constitutes a highly effective technique for runtime monitoring on an architectural level.}, language = {en} } @phdthesis{Afifi2023, author = {Afifi, Haitham}, title = {Wireless In-Network Processing for Multimedia Applications}, doi = {10.25932/publishup-60437}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-604371}, school = {Universit{\"a}t Potsdam}, pages = {xiii, 233}, year = {2023}, abstract = {With the recent growth of sensors, cloud computing handles the data processing of many applications. Processing some of this data on the cloud raises, however, many concerns regarding, e.g., privacy, latency, or single points of failure. Alternatively, thanks to the development of embedded systems, smart wireless devices can share their computation capacity, creating a local wireless cloud for in-network processing. In this context, the processing of an application is divided into smaller jobs so that a device can run one or more jobs. The contribution of this thesis to this scenario is divided into three parts. In part one, I focus on wireless aspects, such as power control and interference management, for deciding which jobs to run on which node and how to route data between nodes. Hence, I formulate optimization problems and develop heuristic and meta-heuristic algorithms to allocate wireless and computation resources. Additionally, to deal with multiple applications competing for these resources, I develop a reinforcement learning (RL) admission controller to decide which application should be admitted. Next, I look into acoustic applications to improve wireless throughput by using microphone clock synchronization to synchronize wireless transmissions. In the second part, I jointly work with colleagues from the acoustic processing field to optimize both network and application (i.e., acoustic) qualities. 
My contribution focuses on the network part, where I study the relation between acoustic and network qualities when selecting a subset of microphones for collecting audio data or selecting a subset of optional jobs for processing these data; too many microphones or too many jobs can lessen quality by unnecessary delays. Hence, I develop RL solutions to select the subset of microphones under network constraints when the speaker is moving while still providing good acoustic quality. Furthermore, I show that autonomous vehicles carrying microphones improve the acoustic qualities of different applications. Accordingly, I develop RL solutions (single and multi-agent ones) for controlling these vehicles. In the third part, I close the gap between theory and practice. I describe the features of my open-source framework used as a proof of concept for wireless in-network processing. Next, I demonstrate how to run some algorithms developed by colleagues from acoustic processing using my framework. I also use the framework for studying in-network delays (wireless and processing) using different distributions of jobs and network topologies.}, language = {en} } @phdthesis{Lindinger2023, author = {Lindinger, Jakob}, title = {Variational inference for composite {Gaussian} process models}, doi = {10.25932/publishup-60444}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-604441}, school = {Universit{\"a}t Potsdam}, pages = {xi, 122}, year = {2023}, abstract = {Most machine learning methods provide only point estimates when being queried to predict on new data. This is problematic when the data is corrupted by noise, e.g. from imperfect measurements, or when the queried data point is very different to the data that the machine learning model has been trained with.
Probabilistic modelling in machine learning naturally equips predictions with corresponding uncertainty estimates which allows a practitioner to incorporate information about measurement noise into the modelling process and to know when not to trust the predictions. A well-understood, flexible probabilistic framework is provided by Gaussian processes that are ideal as building blocks of probabilistic models. They lend themselves naturally to the problem of regression, i.e., being given a set of inputs and corresponding observations and then predicting likely observations for new unseen inputs, and can also be adapted to many more machine learning tasks. However, exactly inferring the optimal parameters of such a Gaussian process model (in a computationally tractable manner) is only possible for regression tasks in small data regimes. Otherwise, approximate inference methods are needed, the most prominent of which is variational inference. In this dissertation we study models that are composed of Gaussian processes embedded in other models in order to make those more flexible and/or probabilistic. The first example are deep Gaussian processes which can be thought of as a small network of Gaussian processes and which can be employed for flexible regression. The second model class that we study are Gaussian process state-space models. These can be used for time-series modelling, i.e., the task of being given a stream of data ordered by time and then predicting future observations. For both model classes the state-of-the-art approaches offer a trade-off between expressive models and computational properties (e.g. speed or convergence properties) and mostly employ variational inference. Our goal is to improve inference in both models by first getting a deep understanding of the existing methods and then, based on this, to design better inference methods.
We achieve this by either exploring the existing trade-offs or by providing general improvements applicable to multiple methods. We first provide an extensive background, introducing Gaussian processes and their sparse (approximate and efficient) variants. We continue with a description of the models under consideration in this thesis, deep Gaussian processes and Gaussian process state-space models, including detailed derivations and a theoretical comparison of existing methods. Then we start analysing deep Gaussian processes more closely: Trading off the properties (good optimisation versus expressivity) of state-of-the-art methods in this field, we propose a new variational inference based approach. We then demonstrate experimentally that our new algorithm leads to better calibrated uncertainty estimates than existing methods. Next, we turn our attention to Gaussian process state-space models, where we closely analyse the theoretical properties of existing methods. The understanding gained in this process leads us to propose a new inference scheme for general Gaussian process state-space models that incorporates effects on multiple time scales. This method is more efficient than previous approaches for long time series and outperforms its comparison partners on data sets in which effects on multiple time scales (fast and slowly varying dynamics) are present. Finally, we propose a new inference approach for Gaussian process state-space models that trades off the properties of state-of-the-art methods in this field.
By combining variational inference with another approximate inference method, the Laplace approximation, we design an efficient algorithm that outperforms its comparison partners since it achieves better calibrated uncertainties.}, language = {en} } @phdthesis{Perscheid2023, author = {Perscheid, Cindy}, title = {Integrative biomarker detection using prior knowledge on gene expression data sets}, doi = {10.25932/publishup-58241}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-582418}, school = {Universit{\"a}t Potsdam}, pages = {ix, 197}, year = {2023}, abstract = {Gene expression data is analyzed to identify biomarkers, e.g. relevant genes, which serve for diagnostic, predictive, or prognostic use. Traditional approaches for biomarker detection select distinctive features from the data based exclusively on the signals therein, facing multiple shortcomings in regards to overfitting, biomarker robustness, and actual biological relevance. Prior knowledge approaches are expected to address these issues by incorporating prior biological knowledge, e.g. on gene-disease associations, into the actual analysis. However, prior knowledge approaches are currently not widely applied in practice because they are often use-case specific and seldom applicable in a different scope. This leads to a lack of comparability of prior knowledge approaches, which in turn makes it currently impossible to assess their effectiveness in a broader context. Our work addresses the aforementioned issues with three contributions. Our first contribution provides formal definitions for both prior knowledge and the flexible integration thereof into the feature selection process. Central to these concepts is the automatic retrieval of prior knowledge from online knowledge bases, which allows for streamlining the retrieval process and agreeing on a uniform definition for prior knowledge. 
We subsequently describe novel and generalized prior knowledge approaches that are flexible regarding the used prior knowledge and applicable to varying use case domains. Our second contribution is the benchmarking platform Comprior. Comprior applies the aforementioned concepts in practice and allows for flexibly setting up comprehensive benchmarking studies for examining the performance of existing and novel prior knowledge approaches. It streamlines the retrieval of prior knowledge and allows for combining it with prior knowledge approaches. Comprior demonstrates the practical applicability of our concepts and further fosters the overall development and comparability of prior knowledge approaches. Our third contribution is a comprehensive case study on the effectiveness of prior knowledge approaches. For that, we used Comprior and tested a broad range of both traditional and prior knowledge approaches in combination with multiple knowledge bases on data sets from multiple disease domains. Ultimately, our case study constitutes a thorough assessment of a) the suitability of selected knowledge bases for integration, b) the impact of prior knowledge being applied at different integration levels, and c) the improvements in terms of classification performance, biological relevance, and overall robustness. In summary, our contributions demonstrate that generalized concepts for prior knowledge and a streamlined retrieval process improve the applicability of prior knowledge approaches. Results from our case study show that the integration of prior knowledge positively affects biomarker results, particularly regarding their robustness. 
Our findings provide the first in-depth insights on the effectiveness of prior knowledge approaches and build a valuable foundation for future research.}, language = {en} } @phdthesis{Kossmann2023, author = {Ko{\ss}mann, Jan}, title = {Unsupervised database optimization}, doi = {10.25932/publishup-58949}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-589490}, school = {Universit{\"a}t Potsdam}, pages = {xi, 203}, year = {2023}, abstract = {The amount of data stored in databases and the complexity of database workloads are ever-increasing. Database management systems (DBMSs) offer many configuration options, such as index creation or unique constraints, which must be adapted to the specific instance to efficiently process large volumes of data. Currently, such database optimization is complicated, manual work performed by highly skilled database administrators (DBAs). In cloud scenarios, manual database optimization even becomes infeasible: it exceeds the abilities of the best DBAs due to the enormous number of deployed DBMS instances (some providers maintain millions of instances), missing domain knowledge resulting from data privacy requirements, and the complexity of the configuration tasks. Therefore, we investigate how to automate the configuration of DBMSs efficiently with the help of unsupervised database optimization. While there are numerous configuration options, in this thesis, we focus on automatic index selection and the use of data dependencies, such as functional dependencies, for query optimization. Both aspects have an extensive performance impact and complement each other by approaching unsupervised database optimization from different perspectives. Our contributions are as follows: (1) we survey automated state-of-the-art index selection algorithms regarding various criteria, e.g., their support for index interaction.
We contribute an extensible platform for evaluating the performance of such algorithms with industry-standard datasets and workloads. The platform is well-received by the community and has led to follow-up research. With our platform, we derive the strengths and weaknesses of the investigated algorithms. We conclude that existing solutions often have scalability issues and cannot quickly determine (near-)optimal solutions for large problem instances. (2) To overcome these limitations, we present two new algorithms. Extend determines (near-)optimal solutions with an iterative heuristic. It identifies the best index configurations for the evaluated benchmarks. Its selection runtimes are up to 10 times lower compared with other near-optimal approaches. SWIRL is based on reinforcement learning and delivers solutions instantly. These solutions perform within 3 \% of the optimal ones. Extend and SWIRL are available as open-source implementations. (3) Our index selection efforts are complemented by a mechanism that analyzes workloads to determine data dependencies for query optimization in an unsupervised fashion. We describe and classify 58 query optimization techniques based on functional, order, and inclusion dependencies as well as on unique column combinations. The unsupervised mechanism and three optimization techniques are implemented in our open-source research DBMS Hyrise. Our approach reduces the Join Order Benchmark's runtime by 26 \% and accelerates some TPC-DS queries by up to 58 times. Additionally, we have developed a cockpit for unsupervised database optimization that allows interactive experiments to build confidence in such automated techniques. 
In summary, our contributions improve the performance of DBMSs, support DBAs in their work, and enable them to contribute their time to other, less arduous tasks.}, language = {en} } @phdthesis{Quinzan2023, author = {Quinzan, Francesco}, title = {Combinatorial problems and scalability in artificial intelligence}, doi = {10.25932/publishup-61111}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-611114}, school = {Universit{\"a}t Potsdam}, pages = {xi, 141}, year = {2023}, abstract = {Modern datasets often exhibit diverse, feature-rich, unstructured data, and they are massive in size. This is the case of social networks, human genome, and e-commerce databases. As Artificial Intelligence (AI) systems are increasingly used to detect pattern in data and predict future outcome, there are growing concerns on their ability to process large amounts of data. Motivated by these concerns, we study the problem of designing AI systems that are scalable to very large and heterogeneous data-sets. Many AI systems require to solve combinatorial optimization problems in their course of action. These optimization problems are typically NP-hard, and they may exhibit additional side constraints. However, the underlying objective functions often exhibit additional properties. These properties can be exploited to design suitable optimization algorithms. One of these properties is the well-studied notion of submodularity, which captures diminishing returns. Submodularity is often found in real-world applications. Furthermore, many relevant applications exhibit generalizations of this property. In this thesis, we propose new scalable optimization algorithms for combinatorial problems with diminishing returns. Specifically, we focus on three problems, the Maximum Entropy Sampling problem, Video Summarization, and Feature Selection. For each problem, we propose new algorithms that work at scale. 
These algorithms are based on a variety of techniques, such as forward step-wise selection and adaptive sampling. Our proposed algorithms yield strong approximation guarantees, and they perform well experimentally. We first study the Maximum Entropy Sampling problem. This problem consists of selecting a subset of random variables from a larger set, that maximize the entropy. By using diminishing return properties, we develop a simple forward step-wise selection optimization algorithm for this problem. Then, we study the problem of selecting a subset of frames, that represent a given video. Again, this problem corresponds to a submodular maximization problem. We provide a new adaptive sampling algorithm for this problem, suitable to handle the complex side constraints imposed by the application. We conclude by studying Feature Selection. In this case, the underlying objective functions generalize the notion of submodularity. We provide a new adaptive sequencing algorithm for this problem, based on the Orthogonal Matching Pursuit paradigm. Overall, we study practically relevant combinatorial problems, and we propose new algorithms to solve them. We demonstrate that these algorithms are suitable to handle massive datasets. However, our analysis is not problem-specific, and our results can be applied to other domains, if diminishing return properties hold. We hope that the flexibility of our framework inspires further research into scalability in AI.}, language = {en} } @phdthesis{Tan2023, author = {Tan, Jing}, title = {Multi-Agent Reinforcement Learning for Interactive Decision-Making}, doi = {10.25932/publishup-60700}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-607000}, school = {Universit{\"a}t Potsdam}, pages = {xii, 135}, year = {2023}, abstract = {Distributed decision-making studies the choices made among a group of interactive and self-interested agents.
Specifically, this thesis is concerned with the optimal sequence of choices an agent makes as it tries to maximize its achievement on one or multiple objectives in the dynamic environment. The optimization of distributed decision-making is important in many real-life applications, e.g., resource allocation (of products, energy, bandwidth, computing power, etc.) and robotics (heterogeneous agent cooperation on games or tasks), in various fields such as vehicular network, Internet of Things, smart grid, etc. This thesis proposes three multi-agent reinforcement learning algorithms combined with game-theoretic tools to study strategic interaction between decision makers, using resource allocation in vehicular network as an example. Specifically, the thesis designs an interaction mechanism based on second-price auction, incentivizes the agents to maximize multiple short-term and long-term, individual and system objectives, and simulates a dynamic environment with realistic mobility data to evaluate algorithm performance and study agent behavior. Theoretical results show that the mechanism has Nash equilibria, is a maximization of social welfare and Pareto optimal allocation of resources in a stationary environment. Empirical results show that in the dynamic environment, our proposed learning algorithms outperform state-of-the-art algorithms in single and multi-objective optimization, and demonstrate very good generalization property in significantly different environments. Specifically, with the long-term multi-objective learning algorithm, we demonstrate that by considering the long-term impact of decisions, as well as by incentivizing the agents with a system fairness reward, the agents achieve better results in both individual and system objectives, even when their objectives are private, randomized, and changing over time. 
Moreover, the agents show competitive behavior to maximize individual payoff when resource is scarce, and cooperative behavior in achieving a system objective when resource is abundant; they also learn the rules of the game, without prior knowledge, to overcome disadvantages in initial parameters (e.g., a lower budget). To address practicality concerns, the thesis also provides several computational performance improvement methods, and tests the algorithm in a single-board computer. Results show the feasibility of online training and inference in milliseconds. There are many potential future topics following this work. 1) The interaction mechanism can be modified into a double-auction, eliminating the auctioneer, resembling a completely distributed, ad hoc network; 2) the objectives are assumed to be independent in this thesis, there may be a more realistic assumption regarding correlation between objectives, such as a hierarchy of objectives; 3) current work limits information-sharing between agents, the setup befits applications with privacy requirements or sparse signaling; by allowing more information-sharing between the agents, the algorithms can be modified for more cooperative scenarios such as robotics.}, language = {en} } @phdthesis{Doskoč2023, author = {Dosko{\v{c}}, Vanja}, title = {Mapping restrictions in behaviourally correct learning}, doi = {10.25932/publishup-59311}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-593110}, school = {Universit{\"a}t Potsdam}, pages = {ix, 74}, year = {2023}, abstract = {In this thesis, we investigate language learning in the formalisation of Gold [Gol67]. Here, a learner, being successively presented all information of a target language, conjectures which language it believes to be shown. Once these hypotheses converge syntactically to a correct explanation of the target language, the learning is considered successful. Fittingly, this is termed explanatory learning.
To model learning strategies, we impose restrictions on the hypotheses made, for example requiring the conjectures to follow a monotonic behaviour. This way, we can study the impact a certain restriction has on learning. Recently, the literature shifted towards map charting. Here, various seemingly unrelated restrictions are contrasted, unveiling interesting relations between them. The results are then depicted in maps. For explanatory learning, the literature already provides maps of common restrictions for various forms of data presentation. In the case of behaviourally correct learning, where the learners are required to converge semantically instead of syntactically, the same restrictions as in explanatory learning have been investigated. However, a similarly complete picture regarding their interaction has not been presented yet. In this thesis, we transfer the map charting approach to behaviourally correct learning. In particular, we complete the partial results from the literature for many well-studied restrictions and provide full maps for behaviourally correct learning with different types of data presentation. We also study properties of learners assessed important in the literature. We are interested whether learners are consistent, that is, whether their conjectures include the data they are built on. While learners cannot be assumed consistent in explanatory learning, the opposite is the case in behaviourally correct learning. Even further, it is known that learners following different restrictions may be assumed consistent. We contribute to the literature by showing that this is the case for all studied restrictions. We also investigate mathematically interesting properties of learners. In particular, we are interested in whether learning under a given restriction may be done with strongly Bc-locking learners. 
Such learners are of particular value as they allow to apply simulation arguments when, for example, comparing two learning paradigms to each other. The literature gives a rich ground on when learners may be assumed strongly Bc-locking, which we complete for all studied restrictions.}, language = {en} }

@phdthesis{Hagedorn2023,
  author   = {Hagedorn, Christopher},
  title    = {Parallel execution of causal structure learning on graphics processing units},
  doi      = {10.25932/publishup-59758},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-597582},
  school   = {Universit{\"a}t Potsdam},
  pages    = {8, 192},
  year     = {2023},
  abstract = {Learning the causal structures from observational data is an omnipresent challenge in data science. The amount of observational data available to Causal Structure Learning (CSL) algorithms is increasing as data is collected at high frequency from many data sources nowadays. While processing more data generally yields higher accuracy in CSL, the concomitant increase in the runtime of CSL algorithms hinders their widespread adoption in practice. CSL is a parallelizable problem. Existing parallel CSL algorithms address execution on multi-core Central Processing Units (CPUs) with dozens of compute cores. However, modern computing systems are often heterogeneous and equipped with Graphics Processing Units (GPUs) to accelerate computations. Typically, these GPUs provide several thousand compute cores for massively parallel data processing. To shorten the runtime of CSL algorithms, we design efficient execution strategies that leverage the parallel processing power of GPUs. Particularly, we derive GPU-accelerated variants of a well-known constraint-based CSL method, the PC algorithm, as it allows choosing a statistical Conditional Independence test (CI test) appropriate to the observational data characteristics. 
Our two main contributions are: (1) to reflect differences in the CI tests, we design three GPU-based variants of the PC algorithm tailored to CI tests that handle data with the following characteristics. We develop one variant for data assuming the Gaussian distribution model, one for discrete data, and another for mixed discrete-continuous data and data with non-linear relationships. Each variant is optimized for the appropriate CI test leveraging GPU hardware properties, such as shared or thread-local memory. Our GPU-accelerated variants outperform state-of-the-art parallel CPU-based algorithms by factors of up to 93.4× for data assuming the Gaussian distribution model, up to 54.3× for discrete data, up to 240× for continuous data with non-linear relationships and up to 655× for mixed discrete-continuous data. However, the proposed GPU-based variants are limited to datasets that fit into a single GPU's memory. (2) To overcome this shortcoming, we develop approaches to scale our GPU-based variants beyond a single GPU's memory capacity. For example, we design an out-of-core GPU variant that employs explicit memory management to process arbitrary-sized datasets. Runtime measurements on a large gene expression dataset reveal that our out-of-core GPU variant is 364 times faster than a parallel CPU-based CSL algorithm. 
Overall, our proposed GPU-accelerated variants speed up CSL in numerous settings to foster CSL's adoption in practice and research.}, language = {en} }

@book{BarkowskyGiese2023,
  author    = {Barkowsky, Matthias and Giese, Holger},
  title     = {Modular and incremental global model management with extended generalized discrimination networks},
  number    = {154},
  isbn      = {978-3-86956-555-2},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-57396},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-573965},
  publisher = {Universit{\"a}t Potsdam},
  pages     = {63},
  year      = {2023},
  abstract  = {Complex projects developed under the model-driven engineering paradigm nowadays often involve several interrelated models, which are automatically processed via a multitude of model operations. Modular and incremental construction and execution of such networks of models and model operations are required to accommodate efficient development with potentially large-scale models. The underlying problem is also called Global Model Management. In this report, we propose an approach to modular and incremental Global Model Management via an extension to the existing technique of Generalized Discrimination Networks (GDNs). In addition to further generalizing the notion of query operations employed in GDNs, we adapt the previously query-only mechanism to operations with side effects to integrate model transformation and model synchronization. We provide incremental algorithms for the execution of the resulting extended Generalized Discrimination Networks (eGDNs), as well as a prototypical implementation for a number of example eGDN operations. Based on this prototypical implementation, we experiment with an application scenario from the software development domain to empirically evaluate our approach with respect to scalability and conceptually demonstrate its applicability in a typical scenario. 
Initial results confirm that the presented approach can indeed be employed to realize efficient Global Model Management in the considered scenario.}, language = {en} }

@phdthesis{Shekhar2023,
  author   = {Shekhar, Sumit},
  title    = {Image and video processing based on intrinsic attributes},
  doi      = {10.25932/publishup-62004},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-620049},
  school   = {Universit{\"a}t Potsdam},
  pages    = {xii, 143},
  year     = {2023},
  abstract = {Advancements in computer vision techniques driven by machine learning have facilitated robust and efficient estimation of attributes such as depth, optical flow, albedo, and shading. To encapsulate all such underlying properties associated with images and videos, we evolve the concept of intrinsic images towards intrinsic attributes. Further, rapid hardware growth in the form of high-quality smartphone cameras, readily available depth sensors, mobile GPUs, or dedicated neural processing units have made image and video processing pervasive. In this thesis, we explore the synergies between the above two advancements and propose novel image and video processing techniques and systems based on them. To begin with, we investigate intrinsic image decomposition approaches and analyze how they can be implemented on mobile devices. We propose an approach that considers not only diffuse reflection but also specular reflection; it allows us to decompose an image into specularity, albedo, and shading on a resource constrained system (e.g., smartphones or tablets) using the depth data provided by the built-in depth sensors. In addition, we explore how on-device depth data can further be used to add an immersive dimension to 2D photos, e.g., showcasing parallax effects via 3D photography. In this regard, we develop a novel system for interactive 3D photo generation and stylization on mobile devices. 
Further, we investigate how adaptive manipulation of baseline-albedo (i.e., chromaticity) can be used for efficient visual enhancement under low-lighting conditions. The proposed technique allows for interactive editing of enhancement settings while achieving improved quality and performance. We analyze the inherent optical flow and temporal noise as intrinsic properties of a video. We further propose two new techniques for applying the above intrinsic attributes for the purpose of consistent video filtering. To this end, we investigate how to remove temporal inconsistencies perceived as flickering artifacts. One of the techniques does not require costly optical flow estimation, while both provide interactive consistency control. Using intrinsic attributes for image and video processing enables new solutions for mobile devices - a pervasive visual computing device - and will facilitate novel applications for Augmented Reality (AR), 3D photography, and video stylization. The proposed low-light enhancement techniques can also improve the accuracy of high-level computer vision tasks (e.g., face detection) under low-light conditions. 
Finally, our approach for consistent video filtering can extend a wide range of image-based processing for videos.}, language = {en} }

@book{SchwarzerWeissSaoumiKitteletal.2023,
  author    = {Schwarzer, Ingo and Wei{\ss}-Saoumi, Said and Kittel, Roland and Friedrich, Tobias and Kaynak, Koraltan and Durak, Cemil and Isbarn, Andreas and Diestel, J{\"o}rg and Knittel, Jens and Franz, Marquart and Morra, Carlos and Stahnke, Susanne and Braband, Jens and Dittmann, Johannes and Griebel, Stephan and Krampf, Andreas and Link, Martin and M{\"u}ller, Matthias and Radestock, Jens and Strub, Leo and Bleeke, Kai and Jehl, Leander and Kapitza, R{\"u}diger and Messadi, Ines and Schmidt, Stefan and Schwarz-R{\"u}sch, Signe and Pirl, Lukas and Schmid, Robert and Friedenberger, Dirk and Beilharz, Jossekin Jakob and Boockmeyer, Arne and Polze, Andreas and R{\"o}hrig, Ralf and Sch{\"a}be, Hendrik and Thiermann, Ricky},
  title     = {RailChain},
  number    = {152},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-550-7},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-57740},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-577409},
  pages     = {140},
  year      = {2023},
  abstract  = {The RailChain project designed, implemented, and experimentally evaluated a juridical recorder that is based on a distributed consensus protocol. That juridical blockchain recorder has been realized as distributed ledger on board the advanced TrainLab (ICE-TD 605 017) of Deutsche Bahn. For the project, a consortium consisting of DB Systel, Siemens, Siemens Mobility, the Hasso Plattner Institute for Digital Engineering, Technische Universit{\"a}t Braunschweig, T{\"U}V Rheinland InterTraffic, and Spherity has been formed. These partners not only concentrated competencies in railway operation, computer science, regulation, and approval, but also combined experiences from industry, research from academia, and enthusiasm from startups. 
Distributed ledger technologies (DLTs) define distributed databases and express a digital protocol for transactions between business partners without the need for a trusted intermediary. The implementation of a blockchain with real-time requirements for the local network of a railway system (e.g., interlocking or train) allows to log data in the distributed system verifiably in real-time. For this, railway-specific assumptions can be leveraged to make modifications to standard blockchains protocols. EULYNX and OCORA (Open CCS On-board Reference Architecture) are parts of a future European reference architecture for control command and signalling (CCS, Reference CCS Architecture - RCA). Both architectural concepts outline heterogeneous IT systems with components from multiple manufacturers. Such systems introduce novel challenges for the approved and safety-relevant CCS of railways which were considered neither for road-side nor for on-board systems so far. Logging implementations, such as the common juridical recorder on vehicles, can no longer be realized as a central component of a single manufacturer. All centralized approaches are in question. The research project RailChain is funded by the mFUND program and gives practical evidence that distributed consensus protocols are a proper means to immutably (for legal purposes) store state information of many system components from multiple manufacturers. The results of RailChain have been published, prototypically implemented, and experimentally evaluated in large-scale field tests on the advanced TrainLab. At the same time, the project showed how RailChain can be integrated into the road-side and on-board architecture given by OCORA and EULYNX. Logged data can now be analysed sooner and also their trustworthiness is being increased. 
This enables, e.g., auditable predictive maintenance, because it is ensured that data is authentic and unmodified at any point in time.}, language = {en} }

@book{GarusSawahnWankeetal.2023,
  author    = {Garus, Marcel and Sawahn, Rohan and Wanke, Jonas and Tiedt, Clemens and Granzow, Clara and Kuffner, Tim and Rosenbaum, Jannis and Hagemann, Linus and Wollnik, Tom and Woth, Lorenz and Auringer, Felix and Kantusch, Tobias and Roth, Felix and Hanff, Konrad and Schilli, Niklas and Seibold, Leonard and Lindner, Marc Fabian and Raschack, Selina},
  title     = {Operating systems II - student projects},
  number    = {142},
  editor    = {Grapentin, Andreas and Tiedt, Clemens and Polze, Andreas},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-524-8},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-52636},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-526363},
  pages     = {ix, 114},
  year      = {2023},
  abstract  = {This technical report presents the results of student projects which were prepared during the lecture "Operating Systems II" offered by the "Operating Systems and Middleware" group at HPI in the Summer term of 2020. The lecture covered advanced aspects of operating system implementation and architecture on topics such as Virtualization, File Systems and Input/Output Systems. In addition to attending the lecture, the participating students were encouraged to gather practical experience by completing a project on a closely related topic over the course of the semester. The results of 10 selected exceptional projects are covered in this report. The students have completed hands-on projects on the topics of Operating System Design Concepts and Implementation, Hardware/Software Co-Design, Reverse Engineering, Quantum Computing, Static Source-Code Analysis, Operating Systems History, Application Binary Formats and more. 
It should be recognized that over the course of the semester all of these projects have achieved outstanding results which went far beyond the scope and the expectations of the lecture, and we would like to thank all participating students for their commitment and their effort in completing their respective projects, as well as their work on compiling this report.}, language = {en} }

@article{ShlakaOuahibBerrada2023,
  author    = {Shlaka, Souhad and Ouahib, Sara and Berrada, Khalid},
  title     = {A retrospective feedback of MOOCS in Morocco},
  series    = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  journal   = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  editor    = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and {\.Z}ur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  doi       = {10.25932/publishup-62482},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624826},
  pages     = {317--327},
  year      = {2023},
  abstract  = {The integration of MOOCs into the Moroccan Higher Education (MHE) took place in 2013 by developing different partnerships and projects at national and international levels. As elsewhere, the Covid-19 crisis has played an important role in accelerating distance education in MHE. However, based on our experience as both university professors and specialists in educational engineering, the effective execution of the digital transition has not yet been implemented. 
Thus, in this article, we present a retrospective feedback of MOOCs in Morocco, focusing on the policies taken by the government to better support the digital transition in general and MOOCs in particular. We are therefore seeking to establish an optimal scenario for the promotion of MOOCs, which emphasizes the policies to be considered, and which recalls the importance of conducting a delicate articulation taking into account four levels, namely environmental, institutional, organizational and individual. We conclude with recommendations that are inspired by the Moroccan academic context that focus on the major role that MOOCs plays for university students and on maintaining lifelong learning.}, language = {en} }

@book{Weber2023,
  author    = {Weber, Benedikt},
  title     = {Human pose estimation for decubitus prophylaxis},
  number    = {153},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-551-4},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-56719},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-567196},
  pages     = {73},
  year      = {2023},
  abstract  = {Decubitus is one of the most relevant diseases in nursing and the most expensive to treat. It is caused by sustained pressure on tissue, so it particularly affects bed-bound patients. This work lays a foundation for pressure mattress-based decubitus prophylaxis by implementing a solution to the single-frame 2D Human Pose Estimation problem. For this, methods of Deep Learning are employed. Two approaches are examined, a coarse-to-fine Convolutional Neural Network for direct regression of joint coordinates and a U-Net for the derivation of probability distribution heatmaps. We conclude that training our models on a combined dataset of the publicly available Bodies at Rest and SLP data yields the best results. 
Furthermore, various preprocessing techniques are investigated, and a hyperparameter optimization is performed to discover an improved model architecture. Another finding indicates that the heatmap-based approach outperforms direct regression. This model achieves a mean per-joint position error of 9.11 cm for the Bodies at Rest data and 7.43 cm for the SLP data. We find that it generalizes well on data from mattresses other than those seen during training but has difficulties detecting the arms correctly. Additionally, we give a brief overview of the medical data annotation tool annoto we developed in the bachelor project and furthermore conclude that the Scrum framework and agile practices enhanced our development workflow.}, language = {en} }

@article{TheeraroungchaisriThammetarDuangchindaetal.2023,
  author    = {Theeraroungchaisri, Anuchai and Thammetar, Thapanee and Duangchinda, Vorasuang and Khlaisang, Jintavee},
  title     = {Thai MOOC academy},
  series    = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  journal   = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  editor    = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and {\.Z}ur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  doi       = {10.25932/publishup-62421},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624212},
  pages     = {163--169},
  year      = {2023},
  abstract  = {Thai MOOC Academy is a national digital learning platform that has been serving as a mechanism for promoting lifelong learning in Thailand since 2017. 
It has recently undergone significant improvements and upgrades, including the implementation of a credit bank system and a learner's eportfolio system interconnected with the platform. Thai MOOC Academy is introducing a national credit bank system for accreditation and management, which allows for the transfer of expected learning outcomes and educational qualifications between formal education, non-formal education, and informal education. The credit bank system has five distinct features, including issuing forgery-prevented certificates, recording learning results, transferring external credits within the same wallet, accumulating learning results, and creating a QR code for verification purposes. The paper discusses the features and future potential of Thai MOOC Academy, as it is extended towards a sandbox for the national credit bank system in Thailand.}, language = {en} }

@book{MeinelGalbasHageboelling2023,
  author    = {Meinel, Christoph and Galbas, Michael and Hageb{\"o}lling, David},
  title     = {Digitale Souver{\"a}nit{\"a}t: Erkenntnisse aus dem deutschen Bildungssektor},
  number    = {156},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-560-6},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-59513},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-595138},
  pages     = {1--29},
  year      = {2023},
  abstract  = {Digitale Technologien bieten erhebliche politische, wirtschaftliche und gesellschaftliche Chancen. Zugleich ist der Begriff digitale Souver{\"a}nit{\"a}t zu einem Leitmotiv im deutschen Diskurs {\"u}ber digitale Technologien geworden: das hei{\ss}t, die F{\"a}higkeit des Staates, seine Verantwortung wahrzunehmen und die Bef{\"a}higung der Gesellschaft - und des Einzelnen - sicherzustellen, die digitale Transformation selbstbestimmt zu gestalten. 
Exemplarisch f{\"u}r die Herausforderung in Deutschland und Europa, die Vorteile digitaler Technologien zu nutzen und gleichzeitig Souver{\"a}nit{\"a}tsbedenken zu ber{\"u}cksichtigen, steht der Bildungssektor. Er umfasst Bildung als zentrales {\"o}ffentliches Gut, ein schnell aufkommendes Gesch{\"a}ftsfeld und wachsende Best{\"a}nde an hochsensiblen personenbezogenen Daten. Davon ausgehend beschreibt der Bericht Wege zur Entsch{\"a}rfung des Spannungsverh{\"a}ltnisses zwischen Digitalisierung und Souver{\"a}nit{\"a}t auf drei verschiedenen Ebenen - Staat, Wirtschaft und Individuum - anhand konkreter technischer Projekte im Bildungsbereich: die HPI Schul-Cloud (staatliche Souver{\"a}nit{\"a}t), die MERLOT-Datenr{\"a}ume (wirtschaftliche Souver{\"a}nit{\"a}t) und die openHPI-Plattform (individuelle Souver{\"a}nit{\"a}t).}, language = {de} }

% English edition (number 157) of the German report above (number 156).
% Both records were exported under the key MeinelGalbasHageboelling2023; the
% German edition keeps that key, so this one carries an "a" suffix to stay citable.
@book{MeinelGalbasHageboelling2023a,
  author    = {Meinel, Christoph and Galbas, Michael and Hageb{\"o}lling, David},
  title     = {Digital sovereignty: insights from Germany's education sector},
  number    = {157},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-561-3},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-59772},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-597723},
  pages     = {1--27},
  year      = {2023},
  abstract  = {Digital technology offers significant political, economic, and societal opportunities. At the same time, the notion of digital sovereignty has become a leitmotif in German discourse: the state's capacity to assume its responsibilities and safeguard society's - and individuals' - ability to shape the digital transformation in a self-determined way. The education sector is exemplary for the challenge faced by Germany, and indeed Europe, of harnessing the benefits of digital technology while navigating concerns around sovereignty. 
It encompasses education as a core public good, a rapidly growing field of business, and growing pools of highly sensitive personal data. The report describes pathways to mitigating the tension between digitalization and sovereignty at three different levels - state, economy, and individual - through the lens of concrete technical projects in the education sector: the HPI Schul-Cloud (state sovereignty), the MERLOT data spaces (economic sovereignty), and the openHPI platform (individual sovereignty).}, language = {en} }

@article{MouraSantosCortiFelipeCoimbraCosta2023,
  author    = {Moura Santos, Ana and Corti, Paola and Felipe Coimbra Costa, Luis},
  title     = {How to reuse inclusive stem Moocs in blended settings to engage young girls to scientific careers},
  series    = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  journal   = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  editor    = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and {\.Z}ur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  doi       = {10.25932/publishup-62475},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624756},
  pages     = {271--278},
  year      = {2023},
  abstract  = {The FOSTWOM project (2019-2022), an ERASMUS+ funding, gave METID (Politecnico di Milano) and the MOOC T{\'e}cnico (Instituto Superior T{\'e}cnico, University of Lisbon), together with other partners, the opportunity to support the design and creation of gender-inclusive MOOCs. 
Among other project outputs, we designed a toolkit and a framework that enabled the production of two MOOCs for undergraduate and graduate students in Science, Technology, Engineering and Maths (STEM) and used them as academic content free of gender stereotypes about intellectual ability. In this short paper, the authors aim to 1) briefly share the main outputs of the project; 2) tell the story of how the FOSTWOM approach together with 3) a motivational strategy, the Heroine's Learning Journey, proved to be effective in the context of rural and marginal areas in Brazil, with young girls as a specific target audience.}, language = {en} }

@article{ThienenWeinsteinMeinel2023,
  author    = {von Thienen, Julia and Weinstein, Theresa Julia and Meinel, Christoph},
  title     = {Creative metacognition in design thinking},
  series    = {Frontiers in psychology},
  volume    = {14},
  journal   = {Frontiers in psychology},
  publisher = {Frontiers Research Foundation},
  address   = {Lausanne},
  issn      = {1664-1078},
  doi       = {10.3389/fpsyg.2023.1157001},
  pages     = {20},
  year      = {2023},
  abstract  = {Design thinking is a well-established practical and educational approach to fostering high-level creativity and innovation, which has been refined since the 1950s with the participation of experts like Joy Paul Guilford and Abraham Maslow. Through real-world projects, trainees learn to optimize their creative outcomes by developing and practicing creative cognition and metacognition. This paper provides a holistic perspective on creativity, enabling the formulation of a comprehensive theoretical framework of creative metacognition. It focuses on the design thinking approach to creativity and explores the role of metacognition in four areas of creativity expertise: Products, Processes, People, and Places. 
The analysis includes task-outcome relationships (product metacognition), the monitoring of strategy effectiveness (process metacognition), an understanding of individual or group strengths and weaknesses (people metacognition), and an examination of the mutual impact between environments and creativity (place metacognition). It also reviews measures taken in design thinking education, including a distribution of cognition and metacognition, to support students in their development of creative mastery. On these grounds, we propose extended methods for measuring creative metacognition with the goal of enhancing comprehensive assessments of the phenomenon. Proposed methodological advancements include accuracy sub-scales, experimental tasks where examinees explore problem and solution spaces, combinations of naturalistic observations with capability testing, as well as physiological assessments as indirect measures of creative metacognition.}, language = {en} }

@phdthesis{Podlesny2023,
  author   = {Podlesny, Nikolai Jannik},
  title    = {Quasi-identifier discovery to prevent privacy violating inferences in large high dimensional datasets},
  doi      = {10.25932/publishup-58784},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-587843},
  school   = {Universit{\"a}t Potsdam},
  pages    = {xvi, 140},
  year     = {2023},
  abstract = {Personal data privacy is considered to be a fundamental right. It forms a part of our highest ethical standards and is anchored in legislation and various best practices from the technical perspective. Yet, protecting against personal data exposure is a challenging problem from the perspective of generating privacy-preserving datasets to support machine learning and data mining operations. The issue is further compounded by the fact that devices such as consumer wearables and sensors track user behaviours on such a fine-grained level, thereby accelerating the formation of multi-attribute and large-scale high-dimensional datasets. 
In recent years, increasing news coverage regarding de-anonymisation incidents, including but not limited to the telecommunication, transportation, financial transaction, and healthcare sectors, have resulted in the exposure of sensitive private information. These incidents indicate that releasing privacy-preserving datasets requires serious consideration from the pre-processing perspective. A critical problem that appears in this regard is the time complexity issue in applying syntactic anonymisation methods, such as k-anonymity, l-diversity, or t-closeness to generating privacy-preserving data. Previous studies have shown that this problem is NP-hard. This thesis focuses on large high-dimensional datasets as an example of a special case of data that is characteristically challenging to anonymise using syntactic methods. In essence, large high-dimensional data contains a proportionately large number of attributes in proportion to the population of attribute values. Applying standard syntactic data anonymisation approaches to generating privacy-preserving data based on such methods results in high information-loss, thereby rendering the data useless for analytics operations or in low privacy due to inferences based on the data when information loss is minimised. We postulate that this problem can be resolved effectively by searching for and eliminating all the quasi-identifiers present in a high-dimensional dataset. Essentially, we quantify the privacy-preserving data sharing problem as the Find-QID problem. Further, we show that despite the complex nature of absolute privacy, the discovery of QID can be achieved reliably for large datasets. The risk of private data exposure through inferences can be circumvented, and both can be practicably achieved without the need for high-performance computers. 
For this purpose, we present, implement, and empirically assess both mathematical and engineering optimisation methods for a deterministic discovery of privacy-violating inferences. This includes a greedy search scheme by efficiently queuing QID candidates based on their tuple characteristics, projecting QIDs on Bayesian inferences, and countering Bayesian network's state-space-explosion with an aggregation strategy taken from multigrid context and vectorised GPU acceleration. Part of this work showcases magnitudes of processing acceleration, particularly in high dimensions. We even achieve near real-time runtime for currently impractical applications. At the same time, we demonstrate how such contributions could be abused to de-anonymise Kristine A. and Cameron R. in a public Twitter dataset addressing the US Presidential Election 2020. Finally, this work contributes, implements, and evaluates an extended and generalised version of the novel syntactic anonymisation methodology, attribute compartmentation. Attribute compartmentation promises sanitised datasets without remaining quasi-identifiers while minimising information loss. To prove its functionality in the real world, we partner with digital health experts to conduct a medical use case study. 
As part of the experiments, we illustrate that attribute compartmentation is suitable for everyday use and, as a positive side effect, even circumvents a common domain issue of base rate neglect.}, language = {en} }

@article{SteinbeckMeinel2023,
  author    = {Steinbeck, Hendrik and Meinel, Christoph},
  title     = {What makes an educational video?},
  series    = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  journal   = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?},
  editor    = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and {\.Z}ur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  doi       = {10.25932/publishup-62208},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-622086},
  pages     = {47--58},
  year      = {2023},
  abstract  = {In an effort to describe and produce different formats for video instruction, the research community in technology-enhanced learning, and MOOC scholars in particular, have focused on the general style of video production: whether it is a digitally scripted "talk-and-chalk" or a "talking head" version of a learning unit. Since these production styles include various sub-elements, this paper deconstructs the inherited elements of video production in the context of educational live-streams. Using over 700 videos - both from synchronous and asynchronous modalities of large video-based platforms (YouTube and Twitch), 92 features were found in eight categories of video production. 
These include commonly analyzed features such as the use of green screen and a visible instructor, but also less studied features such as social media connections and changing camera perspective depending on the topic being covered. Overall, the research results enable an analysis of common video production styles and a toolbox for categorizing new formats - independent of their final (a)synchronous use in MOOCs. Keywords: video production, MOOC video styles, live-streaming.}, language = {en} } @article{KristineJonsonCarlonYokoiMauriceGayedetal.2023, author = {Kristine Jonson Carlon, May and Yokoi, Kensuke and Maurice Gayed, John and Suyama, Hiroshi and Cross, Jeffrey}, title = {Preparing for Society 5.0 with MOOC Capabilities Extension}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62080}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-620809}, pages = {9 -- 20}, year = {2023}, abstract = {Academia-industry collaborations are beneficial when both sides bring strengths to the partnership and the collaboration outcome is of mutual benefit. These types of collaboration projects are seen as a low-risk learning opportunity for both parties. In this paper, government initiatives that can change the business landscape and academia-industry collaborations that can provide upskilling opportunities to fill emerging business needs are discussed. 
In light of Japan's push for next-level modernization, a Japanese software company took a positive stance towards building new capabilities outside what it had been offering its customers. Consequently, an academic research group is laying out infrastructure for learning analytics research. An existing learning analytics dashboard was modularized to allow the research group to focus on natural language processing experiments while the software company explores a development framework suitable for data visualization techniques and artificial intelligence development. The results of this endeavor demonstrate that companies working with academia can creatively explore collaborations outside typical university-supported avenues.}, language = {en} } @article{Jin2023, author = {Jin, Tonje}, title = {"One video fit for all"}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62108}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-621080}, pages = {21 -- 35}, year = {2023}, abstract = {Online learning in mathematics has always been challenging, especially for mathematics in STEM education. This paper presents how to make "one fit for all" lecture videos for mathematics in STEM education. In general, we do believe that there is no such thing as "one fit for all" video. 
The curriculum requires a high level of prior knowledge in mathematics from high school to get a good understanding, and the variation of prior knowledge levels among STEM education students is often high. This creates challenges for both online teaching and on-campus teaching. This article presents experimenting and researching on a video format where students can get a real-time feeling, and which fits their needs regarding their existing prior knowledge. They have the possibility to ask and receive answers during the video without having to feel that they must jump into different sources, which helps to reduce unnecessary distractions. The fundamental video format presented here is that of dynamic branching videos, which has to little degree been researched in education related studies. The reason might be that this field is quite new for higher education, and there is relatively high requirement on the video editing skills from the teachers' side considering the platforms that are available so far. The videos are implemented for engineering students who take the Linear Algebra course at the Norwegian University of Science and Technology in spring 2023. Feedback from the students gathered via anonymous surveys so far (N = 21) is very positive. With the high suitability for online teaching, this video format might lead the trend of online learning in the future. 
The design and implementation of dynamic videos in mathematics in higher education was presented for the first time at the EMOOCs conference 2023.}, language = {en} } @article{XueBruillard2023, author = {Xue, Wei and Bruillard, {\'E}ric}, title = {{MOOC} in private {Chinese} universities}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62181}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-621811}, pages = {37 -- 45}, year = {2023}, abstract = {This paper investigates private university students' language learning activities in MOOC platforms and their attitude toward it. The study explores the development of MOOC use in Chinese private universities, with a focus on two modes: online and blended. We conducted empirical studies with students learning French and Japanese as a second foreign language, using questionnaires (N = 387) and interviews (N = 20) at a private university in Wuhan. Our results revealed that the majority of students used the MOOC platform more than twice a week and focused on the MOOC video, materials and assignments. However, we also found that students showed less interest in online communication (forums). 
Those who worked in the blended learning mode, especially Japanese learning students, had a more positive attitude toward MOOCs than other students.}, language = {en} } @article{vanEsvelddeVriesBecchettietal.2023, author = {van Esveld, Selma and de Vries, Nardo and Becchetti, Sibilla and Dopper, Sofia and van Valkenburg, Willem}, title = {Impact of {MOOC} and Other Online Course Development on Campus Education}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62078}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-620785}, pages = {1 -- 8}, year = {2023}, abstract = {The TU Delft Extension School for Continuing Education develops and delivers MOOCs, programs and other online courses for lifelong learners and professionals worldwide focused on Science, Engineering \& Design. At the beginning of 2022, we started a project to examine whether creating an online course had any impact on TU Delft campus education. Through a survey, we collected feedback from 68 TU Delft lecturers involved in developing and offering online courses and programs for lifelong learners and professionals. The lecturers reported on the impact of developing an online course on a personal and curricular level. 
The results showed that the developed online materials, and the acquired skills and experiences from creating online courses, were beneficial for campus education, especially during the transition to remote emergency teaching in the COVID-19 lockdown periods. In this short paper, we will describe the responses in detail and map the benefits and challenges experienced by lecturers when implementing their online course materials and newly acquired educational skills on campus. Finally, we will explore future possibilities to extend the reported, already relevant, impact of MOOCs and of other online courses on campus education.}, language = {en} } @incollection{CorazzaThienen2023, author = {Corazza, Giovanni Emanuele and von Thienen, Julia}, title = {Invention}, series = {The Palgrave encyclopedia of the possible}, booktitle = {The Palgrave encyclopedia of the possible}, editor = {Glăveanu, Vlad Petre}, publisher = {Springer International Publishing}, address = {Cham}, isbn = {978-3-030-90912-3}, doi = {10.1007/978-3-030-90913-0_14}, pages = {806 -- 814}, year = {2023}, abstract = {This entry addresses invention from five different perspectives: (i) definition of the term, (ii) mechanisms underlying invention processes, (iii) (pre-)history of human inventions, (iv) intellectual property protection vs open innovation, and (v) case studies of great inventors. Regarding the definition, an invention is the outcome of a creative process taking place within a technological milieu, which is recognized as successful in terms of its effectiveness as an original technology. In the process of invention, a technological possibility becomes realized. Inventions are distinct from either discovery or innovation. 
In human creative processes, seven mechanisms of invention can be observed, yielding characteristic outcomes: (1) basic inventions, (2) invention branches, (3) invention combinations, (4) invention toolkits, (5) invention exaptations, (6) invention values, and (7) game-changing inventions. The development of humanity has been strongly shaped by inventions ever since early stone tools and the conception of agriculture. An "explosion of creativity" has been associated with Homo sapiens, and inventions in all fields of human endeavor have followed suit, engendering an exponential growth of cumulative culture. This culture development emerges essentially through a reuse of previous inventions, their revision, amendment and rededication. In sociocultural terms, humans have increasingly regulated processes of invention and invention-reuse through concepts such as intellectual property, patents, open innovation and licensing methods. Finally, three case studies of great inventors are considered: Edison, Marconi, and Montessori, next to a discussion of human invention processes as collaborative endeavors.}, language = {en} } @phdthesis{Traifeh2023, author = {Traifeh, Hanadi}, title = {Design Thinking in the Arab world}, doi = {10.25932/publishup-59891}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-598911}, school = {Universit{\"a}t Potsdam}, pages = {ix, 196}, year = {2023}, abstract = {Design Thinking is a human-centered approach to innovation that has become increasingly popular globally over the last decade. While the spread of Design Thinking is well understood and documented in the Western cultural contexts, particularly in Europe and the US due to the popularity of the Stanford-Potsdam Design Thinking education model, this is not the case when it comes to non-Western cultural contexts. This thesis fills a gap identified in the literature regarding how Design Thinking emerged, was perceived, adopted, and practiced in the Arab world. 
The culture in that part of the world differs from that of the Western context, which impacts the mindset of people and how they interact with Design Thinking tools and methods. A mixed-methods research approach was followed in which both quantitative and qualitative methods were employed. First, two methods were used in the quantitative phase: a social media analysis using Twitter as a source of data, and an online questionnaire. The results and analysis of the quantitative data informed the design of the qualitative phase in which two methods were employed: ten semi-structured interviews, and participant observation of seven Design Thinking training events. According to the analyzed data, the Arab world appears to have had an early, though relatively weak, and slow, adoption of Design Thinking since 2006. Increasing adoption, however, has been witnessed over the last decade, especially in Saudi Arabia, the United Arab Emirates and Egypt. The results also show that despite its limited spread, Design Thinking has been practiced the most in education, information technology and communication, administrative services, and the non-profit sectors. The way it is being practiced, though, is not fully aligned with how it is being practiced and taught in the US and Europe, as most people in the region do not necessarily believe in all mindset attributes introduced by the Stanford-Potsdam tradition. Practitioners in the Arab world also seem to shy away from the 'wild side' of Design Thinking in particular, and do not fully appreciate the connection between art-design, and science-engineering. This questions the role of the educational institutions in the region since -according to the findings- they appear to be leading the movement in promoting and developing Design Thinking in the Arab world. Nonetheless, it is notable that people seem to be aware of the positive impact of applying Design Thinking in the region, and its potential to bring meaningful transformation. 
However, they also seem to be concerned about the current cultural, social, political, and economic challenges that may challenge this transformation. Therefore, they call for more awareness and demand to create Arabic, culturally appropriate programs to respond to the local needs. On another note, the lack of Arabic content and local case studies on Design Thinking were identified by several interviewees and were also confirmed by the participant observation as major challenges that are slowing down the spread of Design Thinking or sometimes hampering capacity building in the region. Other challenges that were revealed by the study are: changing the mindset of people, the lack of dedicated Design Thinking spaces, and the need for clear instructions on how to apply Design Thinking methods and activities. The concept of time and how Arabs deal with it, gender management during trainings, and hierarchy and power dynamics among training participants are also among the identified challenges. Another key finding revealed by the study is the confirmation of التفكير التصميمي as the Arabic term to be most widely adopted in the region to refer to Design Thinking, since four other Arabic terms were found to be associated with Design Thinking. 
Based on the findings of the study, the thesis concludes by presenting a list of recommendations on how to overcome the mentioned challenges and what factors should be considered when designing and implementing culturally-customized Design Thinking training in the Arab region.}, language = {en} } @article{AlarioHoyosDelgadoKloosKiendletal.2023, author = {Alario Hoyos, Carlos and Delgado Kloos, Carlos and Kiendl, Doris and Terzieva, Liliya}, title = {{InnovaT} {MOOC}}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62456}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624560}, pages = {229 -- 237}, year = {2023}, abstract = {The COVID-19 pandemic has revealed the importance for university teachers to have adequate pedagogical and technological competences to cope with the various possible educational scenarios (face-to-face, online, hybrid, etc.), making use of appropriate active learning methodologies and supporting technologies to foster a more effective learning environment. In this context, the InnovaT project has been an important initiative to support the development of pedagogical and technological competences of university teachers in Latin America through several trainings aiming to promote teacher innovation. 
These trainings combined synchronous online training through webinars and workshops with asynchronous online training through the MOOC "Innovative Teaching in Higher Education." This MOOC was released twice. The first run took place right during the lockdown of 2020, when Latin American teachers needed urgent training to move to emergency remote teaching overnight. The second run took place in 2022 with the return to face-to-face teaching and the implementation of hybrid educational models. This article shares the results of the design of the MOOC considering the constraints derived from the lockdowns applied in each country, the lessons learned from the delivery of such a MOOC to Latin American university teachers, and the results of the two runs of the MOOC.}, language = {en} } @article{XiaoxiaoShuangshuang2023, author = {Xiaoxiao, Wang and Shuangshuang, Guo}, title = {Promoting global higher education cooperation}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62386}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623865}, pages = {85 -- 93}, year = {2023}, abstract = {The massive growth of MOOCs in 2011 laid the groundwork for the achievement of SDG 4. With the various benefits of MOOCs, there is also anticipation that online education should focus on more interactivity and global collaboration. 
In this context, the Global MOOC and Online Education Alliance (GMA) established a diverse group of 17 world-leading universities and three online education platforms from across 14 countries on all six continents in 2020. Through nearly three years of exploration, GMA has gained experience and achieved progress in fostering global cooperation in higher education. First, in joint teaching, GMA has promoted in-depth cooperation between members inside and outside the alliance. Examples include promoting the exchange of high-quality MOOCs, encouraging the creation of Global Hybrid Classroom, and launching Global Hybrid Classroom Certificate Programs. Second, in capacity building and knowledge sharing, GMA has launched Online Education Dialogues and the Global MOOC and Online Education Conference, inviting global experts to share best practices and attracting more than 10 million viewers around the world. Moreover, GMA is collaborating with international organizations to support teachers' professional growth, create an online learning community, and serve as a resource for further development. Third, in public advocacy, GMA has launched the SDG Hackathon and Global Massive Open Online Challenge (GMOOC) and attracted global learners to acquire knowledge and incubate their innovative ideas within a cross-cultural community to solve real-world problems that all humans face and jointly create a better future. 
Based on past experiences and challenges, GMA will explore more diverse cooperation models with more partners utilizing advanced technology, provide more support for digital transformation in higher education, and further promote global cooperation towards building a human community with a shared future.}, language = {en} } @article{Khaneboubi2023, author = {Khaneboubi, Mehdi}, title = {Visualizing students flows to monitor persistence}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62390}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623906}, pages = {121 -- 131}, year = {2023}, abstract = {Founded in 2013, OpenClassrooms is a French online learning company that offers both paid courses and free MOOCs on a wide range of topics, including computer science and education. In 2021, in partnership with the EDA research unit, OpenClassrooms shared a database to solve the problem of how to increase persistence in their paid courses, which consist of a series of MOOCs and human mentoring. Our statistical analysis aims to identify reasons for dropouts that are due to the course design rather than demographic predictors or external factors. We aim to identify at-risk students, i.e. those who are on the verge of dropping out at a specific moment. To achieve this, we use learning analytics to characterize student behavior. 
We conducted data analysis on a sample of data related to the "Web Designers" and "Instructional Design" courses. By visualizing the student flow and constructing speed and acceleration predictors, we can identify which parts of the course need to be calibrated and when particular attention should be paid to these at-risk students.}, language = {en} } @article{DoğuOezdemirCanBayerMercanetal.2023, author = {Doğu {\"O}zdemir, Paker and Can Bayer, Burak and Mercan, Duygu and Buyurucu, Gamze}, title = {{MOOC}-based Personalized Learning Experience ({PLE})}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62209}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-622098}, pages = {59 -- 66}, year = {2023}, abstract = {This qualitative study explores the impact of Personalized Learning Experience (PLE) courses at a higher education institution from the perspective of undergraduate students. The PLE program requires students to take at least one of their elective courses in the form of MOOCs during their undergraduate studies. 
Drawing on interviews with six students across different faculties, the study identified four key themes that encapsulate the effects of PLE courses: (1) Certificate driven learning with a focus on occupation skill enhancement, (2) diverse course offerings to enhance personal and academic development, (3) learning flexibility, and (4) student satisfaction. The findings suggest that PLE courses offered through MOOC platforms allow students to broaden their academic horizons, gain valuable skills, and tailor their education to better align with their interests and goals. Furthermore, this study highlights the potential benefits of incorporating PLE courses in higher education institutions, emphasizing their role in promoting a more dynamic and student-centered learning environment.}, language = {en} } @article{EbnerEdelsbrunnerHohlaSejkoraetal.2023, author = {Ebner, Martin and Edelsbrunner, Sarah and Hohla-Sejkora, Katharina and Lipp, Silvia and Sch{\"o}n, Sandra}, title = {Role of {MOOCs} and {iMooX} for Austrian Universities}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62213}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-622134}, pages = {77 -- 84}, year = {2023}, abstract = {This research paper provides an overview of the current state of MOOCs (massive open online courses) and universities in Austria, focusing on the national MOOC 
platform iMooX.at. The study begins by presenting the results of an analysis of the performance agreements of 22 Austrian public universities for the period 2022-2024, with a specific focus on the mention of MOOC activities and iMooX. The authors find that 12 of 22 (55 \%) Austrian public universities use at least one of these terms, indicating a growing interest in MOOCs and online learning. Additionally, the authors analyze internal documentation data to share insights into how many universities in Austria have produced and/or used a MOOC on the iMooX platform since its launch in 2014. These findings provide a valuable measure of the current usage and monitoring of MOOCs and iMooX among Austrian higher education institutions. Overall, this research contributes to a better understanding of the current state of MOOCs and their integration within Austrian higher education.}, language = {en} } @article{EgloffsteinHuenemohrIfenthaler2023, author = {Egloffstein, Marc and H{\"u}nemohr, Holger and Ifenthaler, Dirk}, title = {Modularization of open online courses on the eGov-Campus}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62388}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623888}, pages = {105 -- 112}, year = {2023}, abstract = {Modularization describes the transformation of MOOCs from a comprehensive academic course format into 
smaller, more manageable learning offerings. It can be seen as one of the prerequisites for the successful implementation of MOOC-based micro-credentials in professional education and training. This short paper reports on the development and application of a modularization framework for Open Online Courses. Using the example of eGov-Campus, a German MOOC provider for the public sector linked to both academia and formal professional development, the structural specifications for modularized MOOC offerings and a methodology for course transformation as well as associated challenges in technology, organization and educational design are outlined. Following on from this, future prospects are discussed under the headings of individualization, certification and integration.}, language = {en} } @article{NeuboeckLinschinger2023, author = {Neub{\"o}ck, Kristina and Linschinger, Nadine}, title = {Central elements of knowledge and competence development with MOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62466}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624668}, pages = {255 -- 262}, year = {2023}, abstract = {To implement OERs at HEIs sustainably, not just technical infrastructure is required, but also well-trained staff. 
The University of Graz is in charge of an OER training program for university staff as part of the collaborative project Open Education Austria Advanced (OEAA) with the aim of ensuring long-term competence growth in the use and creation of OERs. The program consists of a MOOC and a guided blended learning format that was evaluated to find out which accompanying teaching and learning concepts can best facilitate targeted competence development. The evaluation of the program shows that learning videos, self-study assignments and synchronous sessions are most useful for the learning process. The results indicate that the creation of OERs is a complex process that can be undergone more effectively in the guided program.}, language = {en} } @article{KhlaisangDuangchindaThammetaretal.2023, author = {Khlaisang, Jintavee and Duangchinda, Vorasuang and Thammetar, Thapanee and Theeraroungchaisri, Anuchai}, title = {Instructional design for work-based skill MOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62431}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624318}, pages = {221 -- 227}, year = {2023}, abstract = {As Thailand moves towards becoming an innovation-driven economy, the need for human capital development has become crucial. 
Work-based skill MOOCs, offered on Thai MOOC, a national digital learning platform launched by Thailand Cyber University Project, ministry of Higher Education, Science, Research and Innovation, provide an effective way to overcome this challenge. This paper discusses the challenges faced in designing an instruction for work-based skill MOOCs that can serve as a foundation model for many more to come. The instructional design of work-based skill courses in Thai MOOC involves four simple steps, including course selection, learning from accredited providers, course requirements completion, and certification of acquired skills. The development of such courses is ongoing at the higher education level, vocational level, and pre-university level, which serve as a foundation model for many more work-based skill MOOC that will be offered on Thai MOOC soon. The instructional design of work-based skills courses should focus on the development of currently demanded professional competencies and skills, increasing the efficiency of work in the organization, creativity, and happiness in life that meets the human resources needs of industries in the 4.0 economy era in Thailand. 
This paper aims to present the challenges of designing instruction for work-based skill MOOCs and suggests effective ways to design instruction to enhance workforce development in Thailand.}, language = {en} } @article{DixonTrabucchi2023, author = {Dixon, Fred and Trabucchi, Stefania}, title = {Using analytics in a large virtual classroom for Open edX}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62389}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623895}, pages = {113 -- 120}, year = {2023}, abstract = {The main aim of this article is to explore how learning analytics and synchronous collaboration could improve course completion and learner outcomes in MOOCs, which traditionally have been delivered asynchronously. 
Based on our experience with developing BigBlueButton, a virtual classroom platform that provides educators with live analytics, this paper explores three scenarios with business focused MOOCs to improve outcomes and strengthen learned skills.}, language = {en} } @article{NohrHaugsbakken2023, author = {Nohr, Magnus and Haugsbakken, Halvdan}, title = {A taxonomy of video genres as a scaffolding strategy for video making in education}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62429}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624294}, pages = {201 -- 220}, year = {2023}, abstract = {This research paper aims to introduce a novel practitioner-oriented and research-based taxonomy of video genres. This taxonomy can serve as a scaffolding strategy to support educators throughout the entire educational system in creating videos for pedagogical purposes. A taxonomy of video genres is essential as videos are highly valued resources among learners. Although the use of videos in education has been extensively researched and well-documented in systematic research reviews, gaps remain in the literature. Predominantly, researchers employ sophisticated quantitative methods and similar approaches to measure the performance of videos. 
This trend has led to the emergence of a strong learning analytics research tradition with its embedded literature. This body of research includes analysis of performance of videos in online courses such as Massive Open Online Courses (MOOCs). Surprisingly, this same literature is limited in terms of research outlining approaches to designing and creating educational videos, which applies to both video-based learning and online courses. This issue results in a knowledge gap, highlighting the need for developing pedagogical tools and strategies for video making. These can be found in frameworks, guidelines, and taxonomies, which can serve as scaffolding strategies. In contrast, there appears to be very few frameworks available for designing and creating videos for pedagogical purposes, apart from a few well-known frameworks. In this regard, this research paper proposes a novel taxonomy of video genres that educators can utilize when creating videos intended for use in either video-based learning environments or online courses. 
To create this taxonomy, a large number of videos from online courses were collected and analyzed using a mixed-method research design approach.}, language = {en} } @article{StaubitzSerthThomasetal.2023, author = {Staubitz, Thomas and Serth, Sebastian and Thomas, Max and Ebner, Martin and Koschutnig-Ebner, Markus and Rampelt, Florian and von Stetten, Alexander and Wittke, Andreas}, title = {A metastandard for the international exchange of MOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62415}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624154}, pages = {147 -- 161}, year = {2023}, abstract = {The MOOChub is a joined web-based catalog of all relevant German and Austrian MOOC platforms that lists well over 750 Massive Open Online Courses (MOOCs). Automatically building such a catalog requires that all partners describe and publicly offer the metadata of their courses in the same way. The paper at hand presents the genesis of the idea to establish a common metadata standard and the story of its subsequent development. The result of this effort is, first, an open-licensed de-facto-standard, which is based on existing commonly used standards and second, a first prototypical platform that is using this standard: the MOOChub, which lists all courses of the involved partners. 
This catalog is searchable and provides a more comprehensive overview of basically all MOOCs that are offered by German and Austrian MOOC platforms. Finally, the upcoming developments to further optimize the catalog and the metadata standard are reported.}, language = {en} } @article{GiannatelliTomasini2023, author = {Giannatelli, Ada and Tomasini, Alessandra}, title = {Descriptors and EU Standards to support the recognition of MOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62396}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623967}, pages = {133 -- 146}, year = {2023}, abstract = {Digital technologies have enabled a variety of learning offers that opened new challenges in terms of recognition of formal, informal and non-formal learning, such as MOOCs. This paper focuses on how providing relevant data to describe a MOOC is conducive to increase the transparency of information and, ultimately, the flexibility of European higher education. The EU-funded project ECCOE took up these challenges and developed a solution by identifying the most relevant descriptors of a learning opportunity with a view to supporting a European system for micro-credentials. Descriptors indicate the specific properties of a learning opportunity according to European standards. 
They can provide a recognition framework also for small volumes of learning (micro-credentials) to support the integration of non-formal learning (MOOCs) into formal learning (e.g. institutional university courses) and to tackle skills shortage, upskilling and reskilling by acquiring relevant competencies. The focus on learning outcomes can facilitate the recognition of skills and competences of students and enhance both virtual and physical mobility and employability. This paper presents two contexts where ECCOE descriptors have been adopted: the Politecnico di Milano MOOC platform (Polimi Open Knowledge - POK), which is using these descriptors as the standard information to document the features of its learning opportunities, and the EU-funded Uforest project on urban forestry, which developed a blended training program for students of partner universities whose MOOCs used the ECCOE descriptors. Practice with ECCOE descriptors shows how they can be used not only to detail MOOC features, but also as a compass to design the learning offer. 
In addition, some rules of thumb can be derived and applied when using specific descriptors.}, language = {en} } @article{ConciaDistlerLawetal.2023, author = {Concia, Francesca and Distler, Petr and Law, Gareth and Macerata, Elena and Mariani, Mario and Mossini, Eros and Negrin, Maddalena and Štrok, Marko}, title = {An experience in developing models to use MOOCs in teaching and to advocate OERs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62460}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624609}, pages = {239 -- 254}, year = {2023}, abstract = {Loss of expertise in the fields of Nuclear- and Radio-Chemistry (NRC) is problematic at a scientific and social level. This has been addressed by developing a MOOC, in order to let students in scientific matters discover all the benefits of NRC to society and improving their awareness of this discipline. The MOOC "Essential Radiochemistry for Society" includes current societal challenges related to health, clean and sustainable energy for safety and quality of food and agriculture. 
NRC teachers belonging to CINCH network were invited to use the MOOC in their teaching, according to various usage models: on the basis of these different experiences, some usage patterns were designed, describing context characteristics (number and age of students, course), activities' scheduling and organization, results and students' feedback, with the aim of encouraging the use of MOOCs in university teaching, as an opportunity for both lecturers and students. These models were the basis of a "toolkit for teachers". By experiencing digital teaching resources created by different lecturers, CINCH teachers took a first meaningful step towards understanding the worth of Open Educational Resources (OER) and the importance of their creation, adoption and sharing for knowledge progress. In this paper, the entire path from MOOC concept to MOOC different usage models, to awareness-raising regarding OER is traced in conceptual stages.}, language = {en} } @article{EbnerEdelsbrunnerHohlaSejkoraetal.2023, author = {Ebner, Martin and Edelsbrunner, Sarah and Hohla-Sejkora, Katharina and Mair, Bettina and Sch{\"o}n, Sandra and Lipp, Silvia and Steinkellner, Iris and Stojcevic, Ivana and Zwiauer, Charlotte}, title = {Impact assessment of a MOOC platform}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62422}, url = 
{http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624222}, pages = {171 -- 186}, year = {2023}, abstract = {In 2020, the project "iMooX - The MOOC Platform as a Service for all Austrian Universities" was launched. It is co-financed by the Austrian Ministry of Education, Science and Research. After half of the funding period, the project management wants to assess and share results and outcomes but also address (potential) additional "impacts" of the MOOC platform. Building upon work on OER impact assessment, this contribution describes in detail how the specific iMooX.at approach of impact measurement was developed. Literature review, stakeholder analysis, and problem-based interviews were the base for developing a questionnaire addressing the defined key stakeholder "MOOC creators". The article also presents the survey results in English for the first time but focuses more on the development, strengths, and weaknesses of the selected methods. The article is seen as a contribution to the further development of impact assessment for MOOC platforms.}, language = {en} } @article{HaugsbakkenHagelia2023, author = {Haugsbakken, Halvdan and Hagelia, Marianne}, title = {An asynchronous cooperative leaning design in a Small Private Online Course (SPOC)}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62210}, url = 
{http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-622107}, pages = {67 -- 76}, year = {2023}, abstract = {This short paper sets out to propose a novel and interesting learning design that facilitates for cooperative learning in which students do not conduct traditional group work in an asynchronous online education setting. This learning design will be explored in a Small Private Online Course (SPOC) among teachers and school managers at a teacher education. Such an approach can be made possible by applying specific criteria commonly used to define collaborative learning. Collaboration can be defined, among other things, as a structured way of working among students that includes elements of co-laboring. The cooperative learning design involves adapting various traditional collaborative learning approaches for use in an online learning environment. A critical component of this learning design is that students work on a self-defined case project related to their professional practices. Through an iterative process, students will receive ongoing feedback and formative assessments from instructors and follow students at specific points, meaning that co-constructing of knowledge and learning takes place as the SPOC progresses. 
This learning design can contribute to better learning experiences and outcomes for students, and be a valuable contribution to current research discussions on learning design in Massive Open Online Courses (MOOCs).}, language = {en} } @article{ThirouarddelaVillesbrunneBernaert2023, author = {Thirouard, Maria and de la Vill{\`e}sbrunne, Marie and Bernaert, Oliver}, title = {From MOOC to "2M-POC"}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62426}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624268}, pages = {187 -- 200}, year = {2023}, abstract = {IFP School develops and produces MOOCs since 2014. After the COVID-19 crisis, the demand of our industrial and international partners to offer continuous training to their employees increased drastically in an energy transition and sustainable mobility environment that finds itself in constant and rapid evolution. Therefore, it is time for a new format of digital learning tools to efficiently and rapidly train an important number of employees. To address this new demand, in a more and more digital learning environment, we have completely changed our initial MOOC model to propose an innovative SPOC business model mixing synchronous and asynchronous modules. This paper describes the work that has been done to transform our MOOCs to a hybrid SPOC model. 
We changed the format itself from a standard MOOC model of several weeks to small modules of one week average more adapted to our client's demand. We precisely engineered the exchanges between learners and the social aspect all along the SPOC duration. We propose a multimodal approach with a combination of asynchronous activities like online module, exercises, and synchronous activities like webinars with experts, and after-work sessions. Additionally, this new format increases the number of uses of the MOOC resources by our professors in our own master programs. With all these actions, we were able to reach a completion rate between 80 and 96\% - total enrolled -, compared to the completion rate of 15 to 28\% - total enrolled - as to be recorded in our original MOOC format. This is to be observed for small groups (50-100 learners) as SPOC but also for large groups (more than 2500 learners), as a Massive and Multimodal Private Online Course ("2M-POC"). Today a MOOC is not a simple assembly of videos, text, discussions forums and validation exercises but a complete multimodal learning path including social learning, personal followup, synchronous and asynchronous modules. We conclude that the original MOOC format is not at all suitable to propose efficient training to companies, and we must re-engineer the learning path to have a SPOC hybrid and multimodal training compatible with a cost-effective business model.}, language = {en} } @article{MoralesChanAmadoSalvatierraHernandezRizzardini2023, author = {Morales-Chan, Miguel and Amado-Salvatierra, H{\'e}ctor R. 
and Hern{\'a}ndez-Rizzardini, Rocael}, title = {Optimizing the design, pedagogical decision-making and development of MOOCs through the use of Ai-Based tools}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62387}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-623870}, pages = {95 -- 103}, year = {2023}, abstract = {This work explores the use of different generative AI tools in the design of MOOC courses. Authors in this experience employed a variety of AI-based tools, including natural language processing tools (e.g. Chat-GPT), and multimedia content authoring tools (e.g. DALLE-2, Midjourney, Tome.ai) to assist in the course design process. The aim was to address the unique challenges of MOOC course design, which includes to create engaging and effective content, to design interactive learning activities, and to assess student learning outcomes. The authors identified positive results with the incorporation of AI-based tools, which significantly improved the quality and effectiveness of MOOC course design. The tools proved particularly effective in analyzing and categorizing course content, identifying key learning objectives, and designing interactive learning activities that engaged students and facilitated learning. 
Moreover, the use of AI-based tools, streamlined the course design process, significantly reducing the time required to design and prepare the courses. In conclusion, the integration of generative AI tools into the MOOC course design process holds great potential for improving the quality and efficiency of these courses. Researchers and course designers should consider the advantages of incorporating generative AI tools into their design process to enhance their course offerings and facilitate student learning outcomes while also reducing the time and effort required for course development.}, language = {en} } @phdthesis{Santuber2023, author = {Santuber, Joaquin}, title = {Designing for digital justice}, doi = {10.25932/publishup-60417}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-604178}, school = {Universit{\"a}t Potsdam}, pages = {xviii, 183}, year = {2023}, abstract = {At the beginning of 2020, with COVID-19, courts of justice worldwide had to move online to continue providing judicial service. Digital technologies materialized the court practices in ways unthinkable shortly before the pandemic creating resonances with judicial and legal regulation, as well as frictions. A better understanding of the dynamics at play in the digitalization of courts is paramount for designing justice systems that serve their users better, ensure fair and timely dispute resolutions, and foster access to justice. Building on three major bodies of literature —e-justice, digitalization and organization studies, and design research— Designing for Digital Justice takes a nuanced approach to account for human and more-than-human agencies. Using a qualitative approach, I have studied in depth the digitalization of Chilean courts during the pandemic, specifically between April 2020 and September 2022. 
Leveraging a comprehensive source of primary and secondary data, I traced back the genealogy of the novel materializations of courts' practices structured by the possibilities offered by digital technologies. In five (5) cases studies, I show in detail how the courts got to 1) work remotely, 2) host hearings via videoconference, 3) engage with users via social media (i.e., Facebook and Chat Messenger), 4) broadcast a show with judges answering questions from users via Facebook Live, and 5) record, stream, and upload judicial hearings to YouTube to fulfil the publicity requirement of criminal hearings. The digitalization of courts during the pandemic is characterized by a suspended normativity, which makes innovation possible yet presents risks. While digital technologies enabled the judiciary to provide services continuously, they also created the risk of displacing traditional judicial and legal regulation. Contributing to liminal innovation and digitalization research, Designing for Digital Justice theorizes four phases: 1) the pre-digitalization phase resulting in the development of regulation, 2) the hotspot of digitalization resulting in the extension of regulation, 3) the digital innovation redeveloping regulation (moving to a new, preliminary phase), and 4) the permanence of temporal practices displacing regulation. Contributing to design research Designing for Digital Justice provides new possibilities for innovation in the courts, focusing at different levels to better address tensions generated by digitalization. Fellow researchers will find in these pages a sound theoretical advancement at the intersection of digitalization and justice with novel methodological references. Practitioners will benefit from the actionable governance framework Designing for Digital Justice Model, which provides three fields of possibilities for action to design better justice systems. 
Only by taking into account digital, legal, and social factors can we design better systems that promote access to justice, the rule of law, and, ultimately social peace.}, language = {en} } @phdthesis{Najafi2023, author = {Najafi, Pejman}, title = {Leveraging data science \& engineering for advanced security operations}, doi = {10.25932/publishup-61225}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-612257}, school = {Universit{\"a}t Potsdam}, pages = {xix, 180}, year = {2023}, abstract = {The Security Operations Center (SOC) represents a specialized unit responsible for managing security within enterprises. To aid in its responsibilities, the SOC relies heavily on a Security Information and Event Management (SIEM) system that functions as a centralized repository for all security-related data, providing a comprehensive view of the organization's security posture. Due to the ability to offer such insights, SIEMS are considered indispensable tools facilitating SOC functions, such as monitoring, threat detection, and incident response. Despite advancements in big data architectures and analytics, most SIEMs fall short of keeping pace. Architecturally, they function merely as log search engines, lacking the support for distributed large-scale analytics. Analytically, they rely on rule-based correlation, neglecting the adoption of more advanced data science and machine learning techniques. This thesis first proposes a blueprint for next-generation SIEM systems that emphasize distributed processing and multi-layered storage to enable data mining at a big data scale. Next, with the architectural support, it introduces two data mining approaches for advanced threat detection as part of SOC operations. First, a novel graph mining technique that formulates threat detection within the SIEM system as a large-scale graph mining and inference problem, built on the principles of guilt-by-association and exempt-by-reputation. 
The approach entails the construction of a Heterogeneous Information Network (HIN) that models shared characteristics and associations among entities extracted from SIEM-related events/logs. Thereon, a novel graph-based inference algorithm is used to infer a node's maliciousness score based on its associations with other entities in the HIN. Second, an innovative outlier detection technique that imitates a SOC analyst's reasoning process to find anomalies/outliers. The approach emphasizes explainability and simplicity, achieved by combining the output of simple context-aware univariate submodels that calculate an outlier score for each entry. Both approaches were tested in academic and real-world settings, demonstrating high performance when compared to other algorithms as well as practicality alongside a large enterprise's SIEM system. This thesis establishes the foundation for next-generation SIEM systems that can enhance today's SOCs and facilitate the transition from human-centric to data-driven security operations.}, language = {en} } @article{UtunenAttias2023, author = {Utunen, Heini and Attias, Melissa}, title = {xMOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62478}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624788}, pages = {279 -- 289}, year = {2023}, abstract = {The World Health Organization designed OpenWHO.org 
to provide an inclusive and accessible online environment to equip learners across the globe with critical up-to-date information and to be able to effectively protect themselves in health emergencies. The platform thus focuses on the eXtended Massive Open Online Course (xMOOC) modality - content-focused and expert-driven, one-to-many modelled, and self-paced for scalable learning. In this paper, we describe how OpenWHO utilized xMOOCs to reach mass audiences during the COVID-19 pandemic; the paper specifically examines the accessibility, language inclusivity and adaptability of hosted xMOOCs. As of February 2023, OpenWHO had 7.5 million enrolments across 200 xMOOCs on health emergency, epidemic, pandemic and other public health topics available across 65 languages, including 46 courses targeted for the COVID-19 pandemic. Our results suggest that the xMOOC modality allowed OpenWHO to expand learning during the pandemic to previously underrepresented groups, including women, participants ages 70 and older, and learners younger than age 20. The OpenWHO use case shows that xMOOCs should be considered when there is a need for massive knowledge transfer in health emergency situations, yet the approach should be context-specific according to the type of health emergency, targeted population and region. Our evidence also supports previous calls to put intervention elements that contribute to removing barriers to access at the core of learning and health information dissemination. 
Equity must be the fundamental principle and organizing criteria for public health work.}, language = {en} } @article{ThomasStaubitzMeinel2023, author = {Thomas, Max and Staubitz, Thomas and Meinel, Christoph}, title = {Preparing MOOChub metadata for the future of online learning}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62483}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624830}, pages = {329 -- 338}, year = {2023}, abstract = {With the growing number of online learning resources, it becomes increasingly difficult and overwhelming to keep track of the latest developments and to find orientation in the plethora of offers. AI-driven services to recommend standalone learning resources or even complete learning paths are discussed as a possible solution for this challenge. To function properly, such services require a well-defined set of metadata provided by the learning resource. During the last few years, the so-called MOOChub metadata format has been established as a de-facto standard by a group of MOOC providers in German-speaking countries. This format, which is based on schema.org, already delivers a quite comprehensive set of metadata. So far, this set has been sufficient to list, display, sort, filter, and search for courses on several MOOC and open educational resources (OER) aggregators. 
AI recommendation services and further automated integration, beyond a plain listing, have special requirements, however. To optimize the format for proper support of such systems, several extensions and modifications have to be applied. We herein report on a set of suggested changes to prepare the format for this task.}, language = {en} } @article{DietzRoth2023, author = {Dietz, Michael and Roth, Dennis}, title = {Student-centered re-design of an online course with card sorting}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62484}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624843}, pages = {339 -- 350}, year = {2023}, abstract = {"How can a course structure be redesigned based on empirical data to enhance the learning effectiveness through a student-centered approach using objective criteria?", was the research question we asked. "Digital Twins for Virtual Commissioning of Production Machines" is a course using several innovative concepts including an in-depth practical part with online experiments, called virtual labs. The teaching-learning concept is continuously evaluated. Card Sorting is a popular method for designing information architectures (IA), "a practice of effectively organizing, structuring, and labeling the content of a website or application into a structure that enables efficient navigation" [11]. 
In the presented higher education context, a so-called hybrid card sort was used, in which each participant had to sort 70 cards into seven predefined categories or create new categories themselves. Twelve out of 28 students voluntarily participated in the process and short interviews were conducted after the activity. The analysis of the category mapping creates a quantitative measure of the (dis-)similarity of the keywords in specific categories using hierarchical clustering (HCA). The learning designer could then interpret the results to make decisions about the number, labeling and order of sections in the course.}, language = {en} } @article{KennedyLaurillardZeitoun2023, author = {Kennedy, Eileen and Laurillard, Diana and Zeitoun, Samar}, title = {The {CoMOOC} model for global professional collaboration on sustainability}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62480}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624803}, pages = {291 -- 303}, year = {2023}, abstract = {This paper presents a new design for MOOCs for professional development of skills needed to meet the UN Sustainable Development Goals - the CoMOOC or Co-designed Massive Open Online Collaboration. The CoMOOC model is based on co-design with multiple stakeholders including end-users within the professional communities the CoMOOC aims to reach. 
This paper shows how the CoMOOC model could help the tertiary sector deliver on the UN Sustainable Development Goals (UNSDGs) - including but not limited to SDG 4 Education - by providing a more effective vehicle for professional development at a scale that the UNSDGs require. Interviews with professionals using MOOCs, and design-based research with professionals have informed the development of the CoMOOC model. This research shows that open, online, collaborative learning experiences are highly effective for building professional community knowledge. Moreover, this research shows that the collaborative learning design at the heart of the CoMOOC model is feasible cross-platform. Research with teachers working in crisis contexts in Lebanon, many of whom were refugees, will be presented to show how this form of large scale, co-designed, online learning can support professionals, even in the most challenging contexts, such as mass displacement, where expertise is urgently required.}, language = {en} } @article{WasilewskiKhaneboubiBruillard2023, author = {Wasilewski, Julie and Khaneboubi, Mehdi and Bruillard, {\'E}ric}, title = {How to detect At-Risk learners in professional finance MOOCs}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62481}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624818}, pages = {305 -- 316}, year = {2023}, 
abstract = {"Financial Analysis" is an online course designed for professionals consisting of three MOOCs, offering a professionally and institutionally recognized certificate in finance. The course is open but not free of charge and attracts mostly professionals from the banking industry. The primary objective of this study is to identify indicators that can predict learners at high risk of failure. To achieve this, we analyzed data from a previous course that had 875 enrolled learners and involve in the course during Fall 2021. We utilized correspondence analysis to examine demographic and behavioral variables. The initial results indicate that demographic factors have a minor impact on the risk of failure in comparison to learners' behaviors on the course platform. Two primary profiles were identified: (1) successful learners who utilized all the documents offered and spent between one to two hours per week, and (2) unsuccessful learners who used less than half of the proposed documents and spent less than one hour per week. Between these groups, at-risk students were identified as those who used more than half of the proposed documents and spent more than two hours per week. 
The goal is to identify those in group 1 who may be at risk of failing and those in group 2 who may succeed in the current MOOC, and to implement strategies to assist all learners in achieving success.}, language = {en} } @article{LorenzBockSchulteOstermann2023, author = {Lorenz, Anja and Bock, Stefanie and Schulte-Ostermann, Juleka}, title = {Challenges and proposals for introducing digital certificates in higher education infrastructures}, series = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, journal = {EMOOCs 2023 : Post-Covid Prospects for Massive Open Online Courses - Boost or Backlash?}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Scott, Jeffrey and Kristine Jonson Carlon, May and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-62470}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624701}, pages = {263 -- 270}, year = {2023}, abstract = {Questions about the recognition of MOOCs within and outside higher education were already being raised in the early 2010s. Today, recognition decisions are still made more or less on a case-by-case basis. However, digital certification approaches are now emerging that could automate recognition processes. The technical development of the required machine-readable documents and infrastructures is already well advanced in some cases. The DigiCerts consortium has developed a solution based on a collective blockchain. There are ongoing and open discussions regarding the particular technology, but the institutional implementation of digital certificates raises further questions. 
A number of workshops have been held at the Institute for Interactive Systems at Technische Hochschule L{\"u}beck, which have identified the need for new responsibilities for issuing certificates. It has also become clear that all members of higher education institutions need to develop skills in the use of digital certificates.}, language = {en} } @book{BarkowskyGiese2023, author = {Barkowsky, Matthias and Giese, Holger}, title = {Triple graph grammars for multi-version models}, number = {155}, isbn = {978-3-86956-556-9}, issn = {1613-5652}, doi = {10.25932/publishup-57399}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-573994}, publisher = {Universit{\"a}t Potsdam}, pages = {28 -- 28}, year = {2023}, abstract = {Like conventional software projects, projects in model-driven software engineering require adequate management of multiple versions of development artifacts, importantly allowing living with temporary inconsistencies. In the case of model-driven software engineering, employed versioning approaches also have to handle situations where different artifacts, that is, different models, are linked via automatic model transformations. In this report, we propose a technique for jointly handling the transformation of multiple versions of a source model into corresponding versions of a target model, which enables the use of a more compact representation that may afford improved execution time of both the transformation and further analysis operations. Our approach is based on the well-known formalism of triple graph grammars and a previously introduced encoding of model version histories called multi-version models. 
In addition to showing the correctness of our approach with respect to the standard semantics of triple graph grammars, we conduct an empirical evaluation that demonstrates the potential benefit regarding execution time performance.}, language = {en} } @inproceedings{EsveldVriesBecchettietal.2023, author = {Esveld, Selma van and Vries, Nardo de and Becchetti, Sibilla and Dopper, Sofia and Valkenburg, Willem van and Carlon, May Kristine Jonson and Yokoi, Kensuke and Gayed, John Maurice and Suyama, Hiroshi and Cross, Jeffrey Scott and Jin, Tonje and Xue, Wei and Bruillard, {\'E}ric and Steinbeck, Hendrik and Meinel, Christoph and {\"O}zdemir, Paker Doğu and Can Bayer, Burak and Mercan, Duygu and Buyurucu, Gamze and Haugsbakken, Halvdan and Hagelia, Marianne and Ebner, Martin and Edelsbrunner, Sarah and Hohla-Sejkora, Katharina and Lipp, Silvia and Sch{\"o}n, Sandra and Xiaoxiao, Wang and Shuangshuang, Guo and Morales-Chan, Miguel and Amado-Salvatierra, H{\'e}ctor R. and Hern{\'a}ndez-Rizzardini, Rocael and Egloffstein, Marc and H{\"u}nemohr, Holger and Ifenthaler, Dirk and Dixon, Fred and Trabucchi, Stefania and Khaneboubi, Mehdi and Giannatelli, Ada and Tomasini, Alessandra and Staubitz, Thomas and Serth, Sebastian and Thomas, Max and Koschutnig-Ebner, Markus and Rampelt, Florian and Stetten, Alexander von and Wittke, Andreas and Theeraroungchaisri, Anuchai and Thammetar, Thapanee and Duangchinda, Vorasuang and Khlaisang, Jintavee and Mair, Bettina and Steinkellner, Iris and Stojcevic, Ivana and Zwiauer, Charlotte and Thirouard, Maria and Vill{\`e}sbrunne, Marie de la and Bernaert, Oliver and Nohr, Magnus and Alario Hoyos, Carlos and Delgado Kloos, Carlos and Kiendl, Doris and Terzieva, Liliya and Concia, Francesca and Distler, Petr and Law, Gareth and Macerata, Elena and Mariani, Mario and Mossini, Eros and Negrin, Maddalena and Štrok, Marko and Neub{\"o}ck, Kristina and Linschinger, Nadine and Lorenz, Anja and Bock, Stefanie and Schulte-Ostermann, Juleka and Moura 
Santos, Ana and Corti, Paola and Costa, Luis Felipe Coimbra and Utunen, Heini and Attias, Melissa and Tokar, Anna and Kennedy, Eileen and Laurillard, Diana and Zeitoun, Samar and Wasilewski, Julie and Shlaka, Souhad and Ouahib, Sara and Berrada, Khalid and Dietz, Michael and Roth, Dennis}, title = {EMOOCs 2023}, editor = {Meinel, Christoph and Schweiger, Stefanie and Staubitz, Thomas and Conrad, Robert and Alario Hoyos, Carlos and Ebner, Martin and Sancassani, Susanna and Żur, Agnieszka and Friedl, Christian and Halawa, Sherif and Gamage, Dilrukshi and Cross, Jeffrey and Jonson Carlon, May Kristine and Deville, Yves and Gaebel, Michael and Delgado Kloos, Carlos and von Schmieden, Karen}, doi = {10.25932/publishup-57645}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-576450}, year = {2023}, abstract = {From June 14 to June 16, 2023, Hasso Plattner Institute, Potsdam, hosted the eighth European MOOC Stakeholder Summit (EMOOCs 2023). The pandemic is fortunately over. It has once again shown how important digital education is. How well-prepared a country was could be seen in our schools, universities, and companies. In different countries, the problems manifested themselves differently. The measures and approaches to solving the problems varied accordingly. Digital education, whether micro-credentials, MOOCs, blended learning formats, or other e-learning tools, received a major boost. EMOOCs 2023 focusses on the effects of this emergency situation. How has it affected the development and delivery of MOOCs and other e-learning offerings all over Europe? Which projects can serve as models for successful digital learning and teaching? Which roles can MOOCs and micro-credentials bear in the current business transformation? Is there a backlash to the routine we knew from pre-Corona times? Or have many things become firmly established in the meantime, e.g. remote work, hybrid conferences, etc.? 
Furthermore, EMOOCs 2023 has a closer look at the development and formalization of digital learning. Micro-credentials are just the starting point. Further steps in this direction would be complete online study programs or full online universities. Another main topic is the networking of learning offers and the standardization of formats and metadata. Examples of fruitful cooperations are the MOOChub, the European MOOC Consortium, and the Common Micro-Credential Framework. The learnings, derived from practical experience and research, are explored in EMOOCs 2023 in four tracks and additional workshops, covering various aspects of this field. In this publication, we present papers from the conference's Research \& Experience Track, the Business Track and the International Track.}, language = {en} } @article{PuriVardeMelo2023, author = {Puri, Manish and Varde, Aparna S. and Melo, Gerard de}, title = {Commonsense based text mining on urban policy}, series = {Language resources and evaluation}, volume = {57}, journal = {Language resources and evaluation}, publisher = {Springer}, address = {Dordrecht [u.a.]}, issn = {1574-020X}, doi = {10.1007/s10579-022-09584-6}, pages = {733 -- 763}, year = {2023}, abstract = {Local laws on urban policy, i.e., ordinances directly affect our daily life in various ways (health, business etc.), yet in practice, for many citizens they remain impervious and complex. This article focuses on an approach to make urban policy more accessible and comprehensible to the general public and to government officials, while also addressing pertinent social media postings. Due to the intricacies of the natural language, ranging from complex legalese in ordinances to informal lingo in tweets, it is practical to harness human judgment here. To this end, we mine ordinances and tweets via reasoning based on commonsense knowledge so as to better account for pragmatics and semantics in the text. 
Ours is pioneering work in ordinance mining, and thus there is no prior labeled training data available for learning. This gap is filled by commonsense knowledge, a prudent choice in situations involving a lack of adequate training data. The ordinance mining can be beneficial to the public in fathoming policies and to officials in assessing policy effectiveness based on public reactions. This work contributes to smart governance, leveraging transparency in governing processes via public involvement. We focus significantly on ordinances contributing to smart cities, hence an important goal is to assess how well an urban region heads towards a smart city as per its policies mapping with smart city characteristics, and the corresponding public satisfaction.}, language = {en} } @article{HagedornSerthMeinel2023, author = {Hagedorn, Christiane and Serth, Sebastian and Meinel, Christoph}, title = {The mysterious adventures of Detective Duke}, series = {Frontiers in education}, volume = {7}, journal = {Frontiers in education}, publisher = {Frontiers Media}, address = {Lausanne}, issn = {2504-284X}, doi = {10.3389/feduc.2022.1016401}, pages = {22}, year = {2023}, abstract = {About 15 years ago, the first Massive Open Online Courses (MOOCs) appeared and revolutionized online education with more interactive and engaging course designs. Yet, keeping learners motivated and ensuring high satisfaction is one of the challenges today's course designers face. Therefore, many MOOC providers employed gamification elements that only boost extrinsic motivation briefly and are limited to platform support. In this article, we introduce and evaluate a gameful learning design we used in several iterations on computer science education courses. For each of the courses on the fundamentals of the Java programming language, we developed a self-contained, continuous story that accompanies learners through their learning journey and helps visualize key concepts. 
Furthermore, we share our approach to creating the surrounding story in our MOOCs and provide a guideline for educators to develop their own stories. Our data and the long-term evaluation spanning over four Java courses between 2017 and 2021 indicates the openness of learners toward storified programming courses in general and highlights those elements that had the highest impact. While only a few learners did not like the story at all, most learners consumed the additional story elements we provided. However, learners' interest in influencing the story through majority voting was negligible and did not show a considerable positive impact, so we continued with a fixed story instead. We did not find evidence that learners just participated in the narrative because they worked on all materials. Instead, for 10-16\% of learners, the story was their main course motivation. We also investigated differences in the presentation format and concluded that several longer audio-book style videos were most preferred by learners in comparison to animated videos or different textual formats. Surprisingly, the availability of a coherent story embedding examples and providing a context for the practical programming exercises also led to a slightly higher ranking in the perceived quality of the learning material (by 4\%). 
With our research in the context of storified MOOCs, we advance gameful learning designs, foster learner engagement and satisfaction in online courses, and help educators ease knowledge transfer for their learners.}, language = {en} } @article{HeckerSteckhanEybenetal.2022, author = {Hecker, Pascal and Steckhan, Nico and Eyben, Florian and Schuller, Bj{\"o}rn Wolfgang and Arnrich, Bert}, title = {Voice Analysis for Neurological Disorder Recognition - A Systematic Review and Perspective on Emerging Trends}, series = {Frontiers in Digital Health}, journal = {Frontiers in Digital Health}, publisher = {Frontiers Media SA}, address = {Lausanne, Schweiz}, issn = {2673-253X}, doi = {10.3389/fdgth.2022.842301}, pages = {16}, year = {2022}, abstract = {Quantifying neurological disorders from voice is a rapidly growing field of research and holds promise for unobtrusive and large-scale disorder monitoring. The data recording setup and data analysis pipelines are both crucial aspects to effectively obtain relevant information from participants. Therefore, we performed a systematic review to provide a high-level overview of practices across various neurological disorders and highlight emerging trends. PRISMA-based literature searches were conducted through PubMed, Web of Science, and IEEE Xplore to identify publications in which original (i.e., newly recorded) datasets were collected. Disorders of interest were psychiatric as well as neurodegenerative disorders, such as bipolar disorder, depression, and stress, as well as amyotrophic lateral sclerosis, Alzheimer's, and Parkinson's disease, and speech impairments (aphasia, dysarthria, and dysphonia). Of the 43 retrieved studies, Parkinson's disease is represented most prominently with 19 discovered datasets. Free speech and read speech tasks are most commonly used across disorders. Besides popular feature extraction toolkits, many studies utilise custom-built feature sets. 
Correlations of acoustic features with psychiatric and neurodegenerative disorders are presented. In terms of analysis, statistical analysis for significance of individual features is commonly used, as well as predictive modeling approaches, especially with support vector machines and a small number of artificial neural networks. An emerging trend and recommendation for future studies is to collect data in everyday life to facilitate longitudinal data collection and to capture the behavior of participants more naturally. Another emerging trend is to record additional modalities to voice, which can potentially increase analytical performance.}, language = {en} } @article{ZieglerPfitznerSchulzetal.2022, author = {Ziegler, Joceline and Pfitzner, Bjarne and Schulz, Heinrich and Saalbach, Axel and Arnrich, Bert}, title = {Defending against Reconstruction Attacks through Differentially Private Federated Learning for Classification of Heterogeneous Chest X-ray Data}, series = {Sensors}, volume = {22}, journal = {Sensors}, number = {14}, publisher = {MDPI}, address = {Basel, Schweiz}, issn = {1424-8220}, doi = {10.3390/s22145195}, pages = {25}, year = {2022}, abstract = {Privacy regulations and the physical distribution of heterogeneous data are often primary concerns for the development of deep learning models in a medical context. This paper evaluates the feasibility of differentially private federated learning for chest X-ray classification as a defense against data privacy attacks. To the best of our knowledge, we are the first to directly compare the impact of differentially private training on two different neural network architectures, DenseNet121 and ResNet50. Extending the federated learning environments previously analyzed in terms of privacy, we simulated a heterogeneous and imbalanced federated setting by distributing images from the public CheXpert and Mendeley chest X-ray datasets unevenly among 36 clients. 
Both non-private baseline models achieved an area under the receiver operating characteristic curve (AUC) of 0.94 on the binary classification task of detecting the presence of a medical finding. We demonstrate that both model architectures are vulnerable to privacy violation by applying image reconstruction attacks to local model updates from individual clients. The attack was particularly successful during later training stages. To mitigate the risk of a privacy breach, we integrated R{\'e}nyi differential privacy with a Gaussian noise mechanism into local model training. We evaluate model performance and attack vulnerability for privacy budgets $\varepsilon \in \{1, 3, 6, 10\}$. The DenseNet121 achieved the best utility-privacy trade-off with an AUC of 0.94 for $\varepsilon = 6$. Model performance deteriorated slightly for individual clients compared to the non-private baseline. The ResNet50 only reached an AUC of 0.76 in the same privacy setting. Its performance was inferior to that of the DenseNet121 for all considered privacy constraints, suggesting that the DenseNet121 architecture is more robust to differentially private training.}, language = {en} } @article{FehrJaramilloGutierrezOalaetal.2022, author = {Fehr, Jana and Jaramillo-Gutierrez, Giovanna and Oala, Luis and Gr{\"o}schel, Matthias I. and Bierwirth, Manuel and Balachandran, Pradeep and Werneck-Leite, Alixandro and Lippert, Christoph}, title = {Piloting a Survey-Based Assessment of Transparency and Trustworthiness with Three Medical AI Tools}, series = {Healthcare}, volume = {10}, journal = {Healthcare}, number = {10}, publisher = {MDPI}, address = {Basel, Schweiz}, issn = {2227-9032}, doi = {10.3390/healthcare10101923}, pages = {30}, year = {2022}, abstract = {Artificial intelligence (AI) offers the potential to support healthcare delivery, but poorly trained or validated algorithms bear risks of harm. 
Ethical guidelines stated transparency about model development and validation as a requirement for trustworthy AI. Abundant guidance exists to provide transparency through reporting, but poorly reported medical AI tools are common. To close this transparency gap, we developed and piloted a framework to quantify the transparency of medical AI tools with three use cases. Our framework comprises a survey to report on the intended use, training and validation data and processes, ethical considerations, and deployment recommendations. The transparency of each response was scored with either 0, 0.5, or 1 to reflect if the requested information was not, partially, or fully provided. Additionally, we assessed on an analogous three-point scale if the provided responses fulfilled the transparency requirement for a set of trustworthiness criteria from ethical guidelines. The degree of transparency and trustworthiness was calculated on a scale from 0\% to 100\%. Our assessment of three medical AI use cases pin-pointed reporting gaps and resulted in transparency scores of 67\% for two use cases and one with 59\%. We report anecdotal evidence that business constraints and limited information from external datasets were major obstacles to providing transparency for the three use cases. The observed transparency gaps also lowered the degree of trustworthiness, indicating compliance gaps with ethical guidelines. All three pilot use cases faced challenges to provide transparency about medical AI tools, but more studies are needed to investigate those in the wider medical AI sector. Applying this framework for an external assessment of transparency may be infeasible if business constraints prevent the disclosure of information. 
New strategies may be necessary to enable audits of medical AI tools while preserving business secrets.}, language = {en} } @phdthesis{Repke2022, author = {Repke, Tim}, title = {Machine-learning-assisted corpus exploration and visualisation}, doi = {10.25932/publishup-56263}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-562636}, school = {Universit{\"a}t Potsdam}, pages = {xii, 131}, year = {2022}, abstract = {Text collections, such as corpora of books, research articles, news, or business documents are an important resource for knowledge discovery. Exploring large document collections by hand is a cumbersome but necessary task to gain new insights and find relevant information. Our digitised society allows us to utilise algorithms to support the information seeking process, for example with the help of retrieval or recommender systems. However, these systems only provide selective views of the data and require some prior knowledge to issue meaningful queries and assess a system's response. The advancements of machine learning allow us to reduce this gap and better assist the information seeking process. For example, instead of sighting countless business documents by hand, journalists and investigators can employ natural language processing techniques, such as named entity recognition. Although this greatly improves the capabilities of a data exploration platform, the wealth of information is still overwhelming. An overview of the entirety of a dataset in the form of a two-dimensional map-like visualisation may help to circumvent this issue. Such overviews enable novel interaction paradigms for users, which are similar to the exploration of digital geographical maps. In particular, they can provide valuable context by indicating how a piece of information fits into the bigger picture. This thesis proposes algorithms that appropriately pre-process heterogeneous documents and compute the layout for datasets of all kinds. 
Traditionally, given high-dimensional semantic representations of the data, so-called dimensionality reduction algorithms are used to compute a layout of the data on a two-dimensional canvas. In this thesis, we focus on text corpora and go beyond only projecting the inherent semantic structure itself. Therefore, we propose three dimensionality reduction approaches that incorporate additional information into the layout process: (1) a multi-objective dimensionality reduction algorithm to jointly visualise semantic information with inherent network information derived from the underlying data; (2) a comparison of initialisation strategies for different dimensionality reduction algorithms to generate a series of layouts for corpora that grow and evolve over time; (3) and an algorithm that updates existing layouts by incorporating user feedback provided by pointwise drag-and-drop edits. This thesis also contains system prototypes to demonstrate the proposed technologies, including pre-processing and layout of the data and presentation in interactive user interfaces.}, language = {en} } @phdthesis{Jiang2022, author = {Jiang, Lan}, title = {Discovering metadata in data files}, doi = {10.25932/publishup-56620}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-566204}, school = {Universit{\"a}t Potsdam}, pages = {x, ii, 117}, year = {2022}, abstract = {It is estimated that data scientists spend up to 80\% of the time exploring, cleaning, and transforming their data. A major reason for that expenditure is the lack of knowledge about the used data, which are often from different sources and have heterogeneous structures. As a means to describe various properties of data, metadata can help data scientists understand and prepare their data, saving time for innovative and valuable data analytics. 
However, metadata do not always exist: some data file formats are not capable of storing them; metadata were deleted for privacy concerns; legacy data may have been produced by systems that were not designed to store and handle metadata. As data are being produced at an unprecedentedly fast pace and stored in diverse formats, manually creating metadata is not only impractical but also error-prone, demanding automatic approaches for metadata detection. In this thesis, we are focused on detecting metadata in CSV files - a type of plain-text file that, similar to spreadsheets, may contain different types of content at arbitrary positions. We propose a taxonomy of metadata in CSV files and specifically address the discovery of three different metadata: line and cell type, aggregations, and primary keys and foreign keys. Data are organized in an ad-hoc manner in CSV files, and do not follow a fixed structure, which is assumed by common data processing tools. Detecting the structure of such files is a prerequisite of extracting information from them, which can be addressed by detecting the semantic type, such as header, data, derived, or footnote, of each line or each cell. We propose the supervised-learning approach Strudel to detect the type of lines and cells. CSV files may also include aggregations. An aggregation represents the arithmetic relationship between a numeric cell and a set of other numeric cells. Our proposed AggreCol algorithm is capable of detecting aggregations of five arithmetic functions in CSV files. Note that stylistic features, such as font style and cell background color, do not exist in CSV files. Our proposed algorithms address the respective problems by using only content, contextual, and computational features. Storing a relational table is also a common usage of CSV files. Primary keys and foreign keys are important metadata for relational databases, which are usually not present for database instances dumped as plain-text files. 
We propose the HoPF algorithm to holistically detect both constraints in relational databases. Our approach is capable of distinguishing true primary and foreign keys from a great amount of spurious unique column combinations and inclusion dependencies, which can be detected by state-of-the-art data profiling algorithms.}, language = {en} } @misc{ZieglerPfitznerSchulzetal.2022, author = {Ziegler, Joceline and Pfitzner, Bjarne and Schulz, Heinrich and Saalbach, Axel and Arnrich, Bert}, title = {Defending against Reconstruction Attacks through Differentially Private Federated Learning for Classification of Heterogeneous Chest X-ray Data}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, number = {14}, doi = {10.25932/publishup-58132}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-581322}, pages = {25}, year = {2022}, abstract = {Privacy regulations and the physical distribution of heterogeneous data are often primary concerns for the development of deep learning models in a medical context. This paper evaluates the feasibility of differentially private federated learning for chest X-ray classification as a defense against data privacy attacks. To the best of our knowledge, we are the first to directly compare the impact of differentially private training on two different neural network architectures, DenseNet121 and ResNet50. Extending the federated learning environments previously analyzed in terms of privacy, we simulated a heterogeneous and imbalanced federated setting by distributing images from the public CheXpert and Mendeley chest X-ray datasets unevenly among 36 clients. Both non-private baseline models achieved an area under the receiver operating characteristic curve (AUC) of 0.94 on the binary classification task of detecting the presence of a medical finding. 
We demonstrate that both model architectures are vulnerable to privacy violation by applying image reconstruction attacks to local model updates from individual clients. The attack was particularly successful during later training stages. To mitigate the risk of a privacy breach, we integrated R{\'e}nyi differential privacy with a Gaussian noise mechanism into local model training. We evaluate model performance and attack vulnerability for privacy budgets $\varepsilon \in \{1, 3, 6, 10\}$. The DenseNet121 achieved the best utility-privacy trade-off with an AUC of 0.94 for $\varepsilon = 6$. Model performance deteriorated slightly for individual clients compared to the non-private baseline. The ResNet50 only reached an AUC of 0.76 in the same privacy setting. Its performance was inferior to that of the DenseNet121 for all considered privacy constraints, suggesting that the DenseNet121 architecture is more robust to differentially private training.}, language = {en} } @phdthesis{Draisbach2022, author = {Draisbach, Uwe}, title = {Efficient duplicate detection and the impact of transitivity}, doi = {10.25932/publishup-57214}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140}, school = {Universit{\"a}t Potsdam}, pages = {x, 150}, year = {2022}, abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but has led to a renaissance in duplicate detection research on the other hand. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. 
Transitivity implies that if record pairs ⟨ri,rj⟩ and ⟨rj,rk⟩ are classified as duplicates, then also record pair ⟨ri,rk⟩ has to be a duplicate. However, this reasoning might contradict with the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the threshold selection to classify whether a record pair is a duplicate or non-duplicate. Our experiments show that independently of the pair selection algorithm and the used similarity measure, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, earlier selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, we present with the Duplicate Count Strategy (DCS) and its enhancement DCS++ two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNMs window size based on the number of detected duplicates and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure is used for pairwise classifications to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. 
We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially for the precision of results. In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best performing clustering approach for duplicate detection, although its runtime is longer than Markov Clustering due to the subexponential time complexity. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} } @phdthesis{Niephaus2022, author = {Niephaus, Fabio}, title = {Exploratory tool-building platforms for polyglot virtual machines}, doi = {10.25932/publishup-57177}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-571776}, school = {Universit{\"a}t Potsdam}, pages = {xxi, 249}, year = {2022}, abstract = {Polyglot programming allows developers to use multiple programming languages within the same software project. While it is common to use more than one language in certain programming domains, developers also apply polyglot programming for other purposes such as to re-use software written in other languages. Although established approaches to polyglot programming come with significant limitations, for example, in terms of performance and tool support, developers still use them to be able to combine languages. Polyglot virtual machines (VMs) such as GraalVM provide a new level of polyglot programming, allowing languages to directly interact with each other. This reduces the amount of glue code needed to combine languages, results in better performance, and enables tools such as debuggers to work across languages. 
However, only a little research has focused on novel tools that are designed to support developers in building software with polyglot VMs. One reason is that tool-building is often an expensive activity, another one is that polyglot VMs are still a moving target as their use cases and requirements are not yet well understood. In this thesis, we present an approach that builds on existing self-sustaining programming systems such as Squeak/Smalltalk to enable exploratory programming, a practice for exploring and gathering software requirements, and re-use their extensive tool-building capabilities in the context of polyglot VMs. Based on TruffleSqueak, our implementation for the GraalVM, we further present five case studies that demonstrate how our approach helps tool developers to design and build tools for polyglot programming. We further show that TruffleSqueak can also be used by application developers to build and evolve polyglot applications at run-time and by language and runtime developers to understand the dynamic behavior of GraalVM languages and internals. Since our platform allows all these developers to apply polyglot programming, it can further help to better understand the advantages, use cases, requirements, and challenges of polyglot VMs. Moreover, we demonstrate that our approach can also be applied to other polyglot VMs and that insights gained through it are transferable to other programming systems. We conclude that our research on tools for polyglot programming is an important step toward making polyglot VMs more approachable for developers in practice. With good tool support, we believe polyglot VMs can make it much more common for developers to take advantage of multiple languages and their ecosystems when building software.}, language = {en} } @misc{FehrJaramilloGutierrezOalaetal.2022, author = {Fehr, Jana and Jaramillo-Gutierrez, Giovanna and Oala, Luis and Gr{\"o}schel, Matthias I. 
and Bierwirth, Manuel and Balachandran, Pradeep and Werneck-Leite, Alixandro and Lippert, Christoph}, title = {Piloting a Survey-Based Assessment of Transparency and Trustworthiness with Three Medical AI Tools}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, number = {15}, doi = {10.25932/publishup-58328}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-583281}, pages = {30}, year = {2022}, abstract = {Artificial intelligence (AI) offers the potential to support healthcare delivery, but poorly trained or validated algorithms bear risks of harm. Ethical guidelines stated transparency about model development and validation as a requirement for trustworthy AI. Abundant guidance exists to provide transparency through reporting, but poorly reported medical AI tools are common. To close this transparency gap, we developed and piloted a framework to quantify the transparency of medical AI tools with three use cases. Our framework comprises a survey to report on the intended use, training and validation data and processes, ethical considerations, and deployment recommendations. The transparency of each response was scored with either 0, 0.5, or 1 to reflect if the requested information was not, partially, or fully provided. Additionally, we assessed on an analogous three-point scale if the provided responses fulfilled the transparency requirement for a set of trustworthiness criteria from ethical guidelines. The degree of transparency and trustworthiness was calculated on a scale from 0\% to 100\%. Our assessment of three medical AI use cases pin-pointed reporting gaps and resulted in transparency scores of 67\% for two use cases and one with 59\%. 
We report anecdotal evidence that business constraints and limited information from external datasets were major obstacles to providing transparency for the three use cases. The observed transparency gaps also lowered the degree of trustworthiness, indicating compliance gaps with ethical guidelines. All three pilot use cases faced challenges to provide transparency about medical AI tools, but more studies are needed to investigate those in the wider medical AI sector. Applying this framework for an external assessment of transparency may be infeasible if business constraints prevent the disclosure of information. New strategies may be necessary to enable audits of medical AI tools while preserving business secrets.}, language = {en} } @phdthesis{Rothenberger2022, author = {Rothenberger, Ralf}, title = {Satisfiability thresholds for non-uniform random k-SAT}, doi = {10.25932/publishup-54970}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-549702}, school = {Universit{\"a}t Potsdam}, pages = {x, 163}, year = {2022}, abstract = {Boolean Satisfiability (SAT) is one of the problems at the core of theoretical computer science. It was the first problem proven to be NP-complete by Cook and, independently, by Levin. Nowadays it is conjectured that SAT cannot be solved in sub-exponential time. Thus, it is generally assumed that SAT and its restricted version k-SAT are hard to solve. However, state-of-the-art SAT solvers can solve even huge practical instances of these problems in a reasonable amount of time. Why is SAT hard in theory, but easy in practice? One approach to answering this question is investigating the average runtime of SAT. In order to analyze this average runtime the random k-SAT model was introduced. The model generates all k-SAT instances with n variables and m clauses with uniform probability. Researching random k-SAT led to a multitude of insights and tools for analyzing random structures in general. 
One major observation was the emergence of the so-called satisfiability threshold: A phase transition point in the number of clauses at which the generated formulas go from asymptotically almost surely satisfiable to asymptotically almost surely unsatisfiable. Additionally, instances around the threshold seem to be particularly hard to solve. In this thesis we analyze a more general model of random k-SAT that we call non-uniform random k-SAT. In contrast to the classical model each of the n Boolean variables now has a distinct probability of being drawn. For each of the m clauses we draw k variables according to the variable distribution and choose their signs uniformly at random. Non-uniform random k-SAT gives us more control over the distribution of Boolean variables in the resulting formulas. This allows us to tailor distributions to the ones observed in practice. Notably, non-uniform random k-SAT contains the previously proposed models random k-SAT, power-law random k-SAT and geometric random k-SAT as special cases. We analyze the satisfiability threshold in non-uniform random k-SAT depending on the variable probability distribution. Our goal is to derive conditions on this distribution under which an equivalent of the satisfiability threshold conjecture holds. We start with the arguably simpler case of non-uniform random 2-SAT. For this model we show under which conditions a threshold exists, if it is sharp or coarse, and what the leading constant of the threshold function is. These are exactly the three ingredients one needs in order to prove or disprove the satisfiability threshold conjecture. For non-uniform random k-SAT with k=3 we only prove sufficient conditions under which a threshold exists. We also show some properties of the variable probabilities under which the threshold is sharp in this case. 
These are the first results on the threshold behavior of non-uniform random k-SAT.}, language = {en} } @article{HagedornHuegleSchlosser2022, author = {Hagedorn, Christopher and Huegle, Johannes and Schlosser, Rainer}, title = {Understanding unforeseen production downtimes in manufacturing processes using log data-driven causal reasoning}, series = {Journal of intelligent manufacturing}, volume = {33}, journal = {Journal of intelligent manufacturing}, number = {7}, publisher = {Springer}, address = {Dordrecht}, issn = {0956-5515}, doi = {10.1007/s10845-022-01952-x}, pages = {2027 -- 2043}, year = {2022}, abstract = {In discrete manufacturing, the knowledge about causal relationships makes it possible to avoid unforeseen production downtimes by identifying their root causes. Learning causal structures from real-world settings remains challenging due to high-dimensional data, a mix of discrete and continuous variables, and requirements for preprocessing log data under the causal perspective. In our work, we address these challenges proposing a process for causal reasoning based on raw machine log data from production monitoring. Within this process, we define a set of transformation rules to extract independent and identically distributed observations. Further, we incorporate a variable selection step to handle high-dimensionality and a discretization step to include continuous variables. We enrich a commonly used causal structure learning algorithm with domain-related orientation rules, which provides a basis for causal reasoning. We demonstrate the process on a real-world dataset from a globally operating precision mechanical engineering company. The dataset contains over 40 million log data entries from production monitoring of a single machine. In this context, we determine the causal structures embedded in operational processes. 
Further, we examine causal effects to support machine operators in avoiding unforeseen production stops, i.e., by detaining machine operators from drawing false conclusions on impacting factors of unforeseen production stops based on experience.}, language = {en} } @phdthesis{Jain2022, author = {Jain, Nitisha}, title = {Representation and curation of knowledge graphs with embeddings}, doi = {10.25932/publishup-61224}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-612240}, school = {Universit{\"a}t Potsdam}, pages = {ii, 104}, year = {2022}, abstract = {Knowledge graphs are structured repositories of knowledge that store facts about the general world or a particular domain in terms of entities and their relationships. Owing to the heterogeneity of use cases that are served by them, there arises a need for the automated construction of domain-specific knowledge graphs from texts. While there have been many research efforts towards open information extraction for automated knowledge graph construction, these techniques do not perform well in domain-specific settings. Furthermore, regardless of whether they are constructed automatically from specific texts or based on real-world facts that are constantly evolving, all knowledge graphs inherently suffer from incompleteness as well as errors in the information they hold. This thesis investigates the challenges encountered during knowledge graph construction and proposes techniques for their curation (a.k.a. refinement) including the correction of semantic ambiguities and the completion of missing facts. Firstly, we leverage existing approaches for the automatic construction of a knowledge graph in the art domain with open information extraction techniques and analyse their limitations. In particular, we focus on the challenging task of named entity recognition for artwork titles and show empirical evidence of performance improvement with our proposed solution for the generation of annotated training data. 
Towards the curation of existing knowledge graphs, we identify the issue of polysemous relations that represent different semantics based on the context. Having concrete semantics for relations is important for downstream applications (e.g. question answering) that are supported by knowledge graphs. Therefore, we define the novel task of finding fine-grained relation semantics in knowledge graphs and propose FineGReS, a data-driven technique that discovers potential sub-relations with fine-grained meaning from existing polysemous relations. We leverage knowledge representation learning methods that generate low-dimensional vectors (or embeddings) for knowledge graphs to capture their semantics and structure. The efficacy and utility of the proposed technique are demonstrated by comparing it with several baselines on the entity classification use case. Further, we explore the semantic representations in knowledge graph embedding models. In the past decade, these models have shown state-of-the-art results for the task of link prediction in the context of knowledge graph completion. In view of the popularity and widespread application of the embedding techniques not only for link prediction but also for different semantic tasks, this thesis presents a critical analysis of the embeddings by quantitatively measuring their semantic capabilities. We investigate and discuss the reasons for the shortcomings of embeddings in terms of the characteristics of the underlying knowledge graph datasets and the training techniques used by popular models. Following up on this, we propose ReasonKGE, a novel method for generating semantically enriched knowledge graph embeddings by taking into account the semantics of the facts that are encapsulated by an ontology accompanying the knowledge graph. 
With a targeted, reasoning-based method for generating negative samples during the training of the models, ReasonKGE is able to not only enhance the link prediction performance, but also reduce the number of semantically inconsistent predictions made by the resultant embeddings, thus improving the quality of knowledge graphs.}, language = {en} } @article{CaselFischbeckFriedrichetal.2022, author = {Casel, Katrin and Fischbeck, Philipp and Friedrich, Tobias and G{\"o}bel, Andreas and Lagodzinski, J. A. Gregor}, title = {Zeros and approximations of Holant polynomials on the complex plane}, series = {Computational complexity : CC}, volume = {31}, journal = {Computational complexity : CC}, number = {2}, publisher = {Springer}, address = {Basel}, issn = {1016-3328}, doi = {10.1007/s00037-022-00226-5}, pages = {52}, year = {2022}, abstract = {We present fully polynomial time approximation schemes for a broad class of Holant problems with complex edge weights, which we call Holant polynomials. We transform these problems into partition functions of abstract combinatorial structures known as polymers in statistical physics. Our method involves establishing zero-free regions for the partition functions of polymer models and using the most significant terms of the cluster expansion to approximate them. Results of our technique include new approximation and sampling algorithms for a diverse class of Holant polynomials in the low-temperature regime (i.e. small external field) and approximation algorithms for general Holant problems with small signature weights. Additionally, we give randomised approximation and sampling algorithms with faster running times for more restrictive classes. 
Finally, we improve the known zero-free regions for a perfect matching polynomial.}, language = {en} } @misc{HeckerSteckhanEybenetal.2022, author = {Hecker, Pascal and Steckhan, Nico and Eyben, Florian and Schuller, Bj{\"o}rn Wolfgang and Arnrich, Bert}, title = {Voice Analysis for Neurological Disorder Recognition - A Systematic Review and Perspective on Emerging Trends}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, number = {13}, doi = {10.25932/publishup-58101}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-581019}, pages = {16}, year = {2022}, abstract = {Quantifying neurological disorders from voice is a rapidly growing field of research and holds promise for unobtrusive and large-scale disorder monitoring. The data recording setup and data analysis pipelines are both crucial aspects to effectively obtain relevant information from participants. Therefore, we performed a systematic review to provide a high-level overview of practices across various neurological disorders and highlight emerging trends. PRISMA-based literature searches were conducted through PubMed, Web of Science, and IEEE Xplore to identify publications in which original (i.e., newly recorded) datasets were collected. Disorders of interest were psychiatric as well as neurodegenerative disorders, such as bipolar disorder, depression, and stress, as well as amyotrophic lateral sclerosis, Alzheimer's, and Parkinson's disease, and speech impairments (aphasia, dysarthria, and dysphonia). Of the 43 retrieved studies, Parkinson's disease is represented most prominently with 19 discovered datasets. Free speech and read speech tasks are most commonly used across disorders. Besides popular feature extraction toolkits, many studies utilise custom-built feature sets. 
Correlations of acoustic features with psychiatric and neurodegenerative disorders are presented. In terms of analysis, statistical analysis for significance of individual features is commonly used, as well as predictive modeling approaches, especially with support vector machines and a small number of artificial neural networks. An emerging trend and recommendation for future studies is to collect data in everyday life to facilitate longitudinal data collection and to capture the behavior of participants more naturally. Another emerging trend is to record additional modalities to voice, which can potentially increase analytical performance.}, language = {en} } @phdthesis{Kovacs2022, author = {Kov{\'a}cs, R{\'o}bert}, title = {Human-scale personal fabrication}, doi = {10.25932/publishup-55539}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-555398}, school = {Universit{\"a}t Potsdam}, pages = {139}, year = {2022}, abstract = {The availability of commercial 3D printers and matching 3D design software has allowed a wide range of users to create physical prototypes - as long as these objects are not larger than hand size. However, when attempting to create larger, "human-scale" objects, such as furniture, not only are these machines too small, but also the commonly used 3D design software is not equipped to design with forces in mind — since forces increase disproportionately with scale. In this thesis, we present a series of end-to-end fabrication software systems that support users in creating human-scale objects. They achieve this by providing three main functions that regular "small-scale" 3D printing software does not offer: (1) subdivision of the object into small printable components combined with ready-made objects, (2) editing based on predefined elements sturdy enough for larger scale, i.e., trusses, and (3) functionality for analyzing, detecting, and fixing structural weaknesses. 
The presented software systems also assist the fabrication process based on either 3D printing or steel welding technology. The presented systems focus on three levels of engineering challenges: (1) fabricating static load-bearing objects, (2) creating mechanisms that involve motion, such as kinematic installations, and finally (3) designing mechanisms with dynamic repetitive movement where power and energy play an important role. We demonstrate and verify the versatility of our systems by building and testing human-scale prototypes, ranging from furniture pieces, pavilions, to animatronic installations and playground equipment. We have also shared our system with schools, fablabs, and fabrication enthusiasts, who have successfully created human-scale objects that can withstand human-scale forces.}, language = {en} } @misc{MontiRautenstrauchGhanbarietal.2022, author = {Monti, Remo and Rautenstrauch, Pia and Ghanbari, Mahsa and Rani James, Alva and Kirchler, Matthias and Ohler, Uwe and Konigorski, Stefan and Lippert, Christoph}, title = {Identifying interpretable gene-biomarker associations with functionally informed kernel-based tests in 190,000 exomes}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, number = {16}, doi = {10.25932/publishup-58607}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-586078}, pages = {16}, year = {2022}, abstract = {Here we present an exome-wide rare genetic variant association study for 30 blood biomarkers in 191,971 individuals in the UK Biobank. We compare gene-based association tests for separate functional variant categories to increase interpretability and identify 193 significant gene-biomarker associations. Genes associated with biomarkers were $\sim$4.5-fold enriched for conferring Mendelian disorders. 
In addition to performing weighted gene-based variant collapsing tests, we design and apply variant-category-specific kernel-based tests that integrate quantitative functional variant effect predictions for missense variants, splicing and the binding of RNA-binding proteins. For these tests, we present a computationally efficient combination of the likelihood-ratio and score tests that found 36\% more associations than the score test alone while also controlling the type-1 error. Kernel-based tests identified 13\% more associations than their gene-based collapsing counterparts and had advantages in the presence of gain of function missense variants. We introduce local collapsing by amino acid position for missense variants and use it to interpret associations and identify potential novel gain of function variants in PIEZO1. Our results show the benefits of investigating different functional mechanisms when performing rare-variant association tests, and demonstrate pervasive rare-variant contribution to biomarker variability.}, language = {en} } @article{MontiRautenstrauchGhanbarietal.2022, author = {Monti, Remo and Rautenstrauch, Pia and Ghanbari, Mahsa and Rani James, Alva and Kirchler, Matthias and Ohler, Uwe and Konigorski, Stefan and Lippert, Christoph}, title = {Identifying interpretable gene-biomarker associations with functionally informed kernel-based tests in 190,000 exomes}, series = {Nature Communications}, volume = {13}, journal = {Nature Communications}, publisher = {Nature Publishing Group UK}, address = {London}, issn = {2041-1723}, doi = {10.1038/s41467-022-32864-2}, pages = {16}, year = {2022}, abstract = {Here we present an exome-wide rare genetic variant association study for 30 blood biomarkers in 191,971 individuals in the UK Biobank. We compare gene-based association tests for separate functional variant categories to increase interpretability and identify 193 significant gene-biomarker associations. 
Genes associated with biomarkers were $\sim$4.5-fold enriched for conferring Mendelian disorders. In addition to performing weighted gene-based variant collapsing tests, we design and apply variant-category-specific kernel-based tests that integrate quantitative functional variant effect predictions for missense variants, splicing and the binding of RNA-binding proteins. For these tests, we present a computationally efficient combination of the likelihood-ratio and score tests that found 36\% more associations than the score test alone while also controlling the type-1 error. Kernel-based tests identified 13\% more associations than their gene-based collapsing counterparts and had advantages in the presence of gain of function missense variants. We introduce local collapsing by amino acid position for missense variants and use it to interpret associations and identify potential novel gain of function variants in PIEZO1. Our results show the benefits of investigating different functional mechanisms when performing rare-variant association tests, and demonstrate pervasive rare-variant contribution to biomarker variability.}, language = {en} } @article{UlrichLutfiRutzenetal.2022, author = {Ulrich, Jens-Uwe and Lutfi, Ahmad and Rutzen, Kilian and Renard, Bernhard Y.}, title = {ReadBouncer}, series = {Bioinformatics}, volume = {38}, journal = {Bioinformatics}, number = {SUPPL 1}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1367-4803}, doi = {10.1093/bioinformatics/btac223}, pages = {153 -- 160}, year = {2022}, abstract = {Motivation: Nanopore sequencers allow targeted sequencing of interesting nucleotide sequences by rejecting other sequences from individual pores. This feature facilitates the enrichment of low-abundant sequences by depleting overrepresented ones in-silico. 
Existing tools for adaptive sampling either apply signal alignment, which cannot handle human-sized reference sequences, or apply read mapping in sequence space relying on fast graphical processing units (GPU) base callers for real-time read rejection. Using nanopore long-read mapping tools is also not optimal when mapping shorter reads as usually analyzed in adaptive sampling applications. Results: Here, we present a new approach for nanopore adaptive sampling that combines fast CPU and GPU base calling with read classification based on Interleaved Bloom Filters. ReadBouncer improves the potential enrichment of low abundance sequences by its high read classification sensitivity and specificity, outperforming existing tools in the field. It robustly removes even reads belonging to large reference sequences while running on commodity hardware without GPUs, making adaptive sampling accessible for in-field researchers. ReadBouncer also provides a user-friendly interface and installer files for end-users without a bioinformatics background.},
  language  = {en}
}

@article{RichlySchlosserBoissier2022,
  author    = {Richly, Keven and Schlosser, Rainer and Boissier, Martin},
  title     = {Budget-conscious fine-grained configuration optimization for spatio-temporal applications},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {15},
  number    = {13},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {[New York]},
  issn      = {2150-8097},
  doi       = {10.14778/3565838.3565858},
  pages     = {4079--4092},
  year      = {2022},
  abstract  = {Based on the performance requirements of modern spatio-temporal data mining applications, in-memory database systems are often used to store and process the data. To efficiently utilize the scarce DRAM capacities, modern database systems support various tuning possibilities to reduce the memory footprint (e.g., data compression) or increase performance (e.g., additional indexes).
However, the selection of cost and performance balancing configurations is challenging due to the vast number of possible setups consisting of mutually dependent individual decisions. In this paper, we introduce a novel approach to jointly optimize the compression, sorting, indexing, and tiering configuration for spatio-temporal workloads. Further, we consider horizontal data partitioning, which enables the independent application of different tuning options on a fine-grained level. We propose different linear programming (LP) models addressing cost dependencies at different levels of accuracy to compute optimized tuning configurations for a given workload and memory budgets. To yield maintainable and robust configurations, we extend our LP-based approach to incorporate reconfiguration costs as well as a worst-case optimization for potential workload scenarios. Further, we demonstrate on a real-world dataset that our models allow to significantly reduce the memory footprint with equal performance or increase the performance with equal memory size compared to existing tuning heuristics.},
  language  = {en}
}

@article{OmolaoyeOmolaoyeKandasamyetal.2022,
  author    = {Omolaoye, Temidayo S. and Omolaoye, Victor Adelakun and Kandasamy, Richard K. and Hachim, Mahmood Yaseen and Du Plessis, Stefan S.},
  title     = {Omics and male infertility},
  journal   = {Life : open access journal},
  volume    = {12},
  number    = {2},
  publisher = {MDPI},
  address   = {Basel},
  issn      = {2075-1729},
  doi       = {10.3390/life12020280},
  pages     = {21},
  year      = {2022},
  abstract  = {Male infertility is a multifaceted disorder affecting approximately 50\% of male partners in infertile couples. Over the years, male infertility has been diagnosed mainly through semen analysis, hormone evaluations, medical records and physical examinations, which of course are fundamental, but yet inefficient, because 30\% of male infertility cases remain idiopathic.
This dilemmatic status of the unknown needs to be addressed with more sophisticated and result-driven technologies and/or techniques. Genetic alterations have been linked with male infertility, thereby unveiling the practicality of investigating this disorder from the "omics" perspective. Omics aims at analyzing the structure and functions of a whole constituent of a given biological function at different levels, including the molecular gene level (genomics), transcript level (transcriptomics), protein level (proteomics) and metabolites level (metabolomics). In the current study, an overview of the four branches of omics and their roles in male infertility are briefly discussed; the potential usefulness of assessing transcriptomic data to understand this pathology is also elucidated. After assessing the publicly obtainable transcriptomic data for datasets on male infertility, a total of 1385 datasets were retrieved, of which 10 datasets met the inclusion criteria and were used for further analysis. These datasets were classified into groups according to the disease or cause of male infertility. The groups include non-obstructive azoospermia (NOA), obstructive azoospermia (OA), non-obstructive and obstructive azoospermia (NOA and OA), spermatogenic dysfunction, sperm dysfunction, and Y chromosome microdeletion. Findings revealed that 8 genes (LDHC, PDHA2, TNP1, TNP2, ODF1, ODF2, SPINK2, PCDHB3) were commonly differentially expressed between all disease groups. Likewise, 56 genes were common between NOA versus NOA and OA (ADAD1, BANF2, BCL2L14, C12orf50, C20orf173, C22orf23, C6orf99, C9orf131, C9orf24, CABS1, CAPZA3, CCDC187, CCDC54, CDKN3, CEP170, CFAP206, CRISP2, CT83, CXorf65, FAM209A, FAM71F1, FAM81B, GALNTL5, GTSF1, H1FNT, HEMGN, HMGB4, KIF2B, LDHC, LOC441601, LYZL2, ODF1, ODF2, PCDHB3, PDHA2, PGK2, PIH1D2, PLCZ1, PROCA1, RIMBP3, ROPN1L, SHCBP1L, SMCP, SPATA16, SPATA19, SPINK2, TEX33, TKTL2, TMCO2, TMCO5A, TNP1, TNP2, TSPAN16, TSSK1B, TTLL2, UBQLN3). 
These genes, particularly the above-mentioned 8 genes, are involved in diverse biological processes such as germ cell development, spermatid development, spermatid differentiation, regulation of proteolysis, spermatogenesis and metabolic processes. Owing to the stage-specific expression of these genes, any mal-expression can ultimately lead to male infertility. Therefore, currently available data on all branches of omics relating to male fertility can be used to identify biomarkers for diagnosing male infertility, which can potentially help in unravelling some idiopathic cases.},
  language  = {en}
}

@article{WittigMirandaHoelzeretal.2022,
  author    = {Wittig, Alice and Miranda, Fabio Malcher and H{\"o}lzer, Martin and Altenburg, Tom and Bartoszewicz, Jakub Maciej and Beyvers, Sebastian and Dieckmann, Marius Alfred and Genske, Ulrich and Giese, Sven Hans-Joachim and Nowicka, Melania and Richard, Hugues and Schiebenhoefer, Henning and Schmachtenberg, Anna-Juliane and Sieben, Paul and Tang, Ming and Tembrockhaus, Julius and Renard, Bernhard Y. and Fuchs, Stephan},
  title     = {{CovRadar}},
  journal   = {Bioinformatics},
  volume    = {38},
  number    = {17},
  publisher = {Oxford Univ. Press},
  address   = {Oxford},
  issn      = {1367-4803},
  doi       = {10.1093/bioinformatics/btac411},
  pages     = {4223--4225},
  year      = {2022},
  abstract  = {The ongoing pandemic caused by SARS-CoV-2 emphasizes the importance of genomic surveillance to understand the evolution of the virus, to monitor the viral population, and plan epidemiological responses. Detailed analysis, easy visualization and intuitive filtering of the latest viral sequences are powerful for this purpose. We present CovRadar, a tool for genomic surveillance of the SARS-CoV-2 Spike protein. CovRadar consists of an analytical pipeline and a web application that enable the analysis and visualization of hundreds of thousand sequences.
First, CovRadar extracts the regions of interest using local alignment, then builds a multiple sequence alignment, infers variants and consensus and finally presents the results in an interactive app, making accessing and reporting simple, flexible and fast.},
  language  = {en}
}

@phdthesis{Schirneck2022,
  author   = {Schirneck, Friedrich Martin},
  title    = {Enumeration algorithms in data profiling},
  school   = {Universit{\"a}t Potsdam},
  year     = {2022},
  pages    = {xiv, 192},
  doi      = {10.25932/publishup-55672},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-556726},
  abstract = {Data profiling is the extraction of metadata from relational databases. An important class of metadata are multi-column dependencies. They come associated with two computational tasks. The detection problem is to decide whether a dependency of a given type and size holds in a database. The discovery problem instead asks to enumerate all valid dependencies of that type. We investigate the two problems for three types of dependencies: unique column combinations (UCCs), functional dependencies (FDs), and inclusion dependencies (INDs). We first treat the parameterized complexity of the detection variants. We prove that the detection of UCCs and FDs, respectively, is W[2]-complete when parameterized by the size of the dependency. The detection of INDs is shown to be one of the first natural W[3]-complete problems. We further settle the enumeration complexity of the three discovery problems by presenting parsimonious equivalences with well-known enumeration problems. Namely, the discovery of UCCs is equivalent to the famous transversal hypergraph problem of enumerating the hitting sets of a hypergraph. The discovery of FDs is equivalent to the simultaneous enumeration of the hitting sets of multiple input hypergraphs. Finally, the discovery of INDs is shown to be equivalent to enumerating the satisfying assignments of antimonotone, 3-normalized Boolean formulas.
In the remainder of the thesis, we design and analyze discovery algorithms for unique column combinations. Since this is as hard as the general transversal hypergraph problem, it is an open question whether the UCCs of a database can be computed in output-polynomial time in the worst case. For the analysis, we therefore focus on instances that are structurally close to databases in practice, most notably, inputs that have small solutions. The equivalence between UCCs and hitting sets transfers the computational hardness, but also allows us to apply ideas from hypergraph theory to data profiling. We devise a discovery algorithm that runs in polynomial space on arbitrary inputs and achieves polynomial delay whenever the maximum size of any minimal UCC is bounded. Central to our approach is the extension problem for minimal hitting sets, that is, to decide for a set of vertices whether they are contained in any minimal solution. We prove that this is yet another problem that is complete for the complexity class W[3], when parameterized by the size of the set that is to be extended. We also give several conditional lower bounds under popular hardness conjectures such as the Strong Exponential Time Hypothesis (SETH). The lower bounds suggest that the running time of our algorithm for the extension problem is close to optimal. We further conduct an empirical analysis of our discovery algorithm on real-world databases to confirm that the hitting set perspective on data profiling has merits also in practice. We show that the resulting enumeration times undercut their theoretical worst-case bounds on practical data, and that the memory consumption of our method is much smaller than that of previous solutions. During the analysis we make two observations about the connection between databases and their corresponding hypergraphs. On the one hand, the hypergraph representations containing all relevant information are usually significantly smaller than the original inputs.
On the other hand, obtaining those hypergraphs is the actual bottleneck of any practical application. The latter often takes much longer than enumerating the solutions, which is in stark contrast to the fact that the preprocessing is guaranteed to be polynomial while the enumeration may take exponential time. To make the first observation rigorous, we introduce a maximum-entropy model for non-uniform random hypergraphs and prove that their expected number of minimal hyperedges undergoes a phase transition with respect to the total number of edges. The result also explains why larger databases may have smaller hypergraphs. Motivated by the second observation, we present a new kind of UCC discovery algorithm called Hitting Set Enumeration with Partial Information and Validation (HPIValid). It utilizes the fast enumeration times in practice in order to speed up the computation of the corresponding hypergraph. This way, we sidestep the bottleneck while maintaining the advantages of the hitting set perspective. An exhaustive empirical evaluation shows that HPIValid outperforms the current state of the art in UCC discovery. 
It is capable of processing databases that were previously out of reach for data profiling.},
  language = {en}
}

@book{EichenrothReinHirschfeld2022,
  author      = {Eichenroth, Friedrich and Rein, Patrick and Hirschfeld, Robert},
  title       = {Fast packrat parsing in a live programming environment},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {135},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-503-3},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-49124},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-491242},
  institution = {Universit{\"a}t Potsdam},
  pages       = {79},
  year        = {2022},
  abstract    = {Language developers who design domain-specific languages or new language features need a way to make fast changes to language definitions. Those fast changes require immediate feedback. Also, it should be possible to parse the developed languages quickly to handle extensive sets of code. Parsing expression grammars provides an easy to understand method for language definitions. Packrat parsing is a method to parse grammars of this kind, but this method is unable to handle left-recursion properly. Existing solutions either partially rewrite left-recursive rules and partly forbid them, or use complex extensions to packrat parsing that are hard to understand and cost-intensive. We investigated methods to make parsing as fast as possible, using easy to follow algorithms while not losing the ability to make fast changes to grammars. We focused our efforts on two approaches. One is to start from an existing technique for limited left-recursion rewriting and enhance it to work for general left-recursive grammars.
The second approach is to design a grammar compilation process to find left-recursion before parsing, and in this way, reduce computational costs wherever possible and generate ready to use parser classes. Rewriting parsing expression grammars is a task that, if done in a general way, unveils a large number of cases such that any rewriting algorithm surpasses the complexity of other left-recursive parsing algorithms. Lookahead operators introduce this complexity. However, most languages have only little portions that are left-recursive and in virtually all cases, have no indirect or hidden left-recursion. This means that the distinction of left-recursive parts of grammars from components that are non-left-recursive holds great improvement potential for existing parsers. In this report, we list all the required steps for grammar rewriting to handle left-recursion, including grammar analysis, grammar rewriting itself, and syntax tree restructuring. Also, we describe the implementation of a parsing expression grammar framework in Squeak/Smalltalk and the possible interactions with the already existing parser Ohm/S. We quantitatively benchmarked this framework directing our focus on parsing time and the ability to use it in a live programming context. Compared with Ohm, we achieved massive parsing time improvements while preserving the ability to use our parser as a live programming tool. The work is essential because, for one, we outlined the difficulties and complexity that come with grammar rewriting.
Also, we removed the existing limitations that came with left-recursion by eliminating them before parsing.},
  language    = {en}
}

@book{FreundRaetschHradilaketal.2022,
  author      = {Freund, Rieke and R{\"a}tsch, Jan Philip and Hradilak, Franziska and Vidic, Benedikt and He{\ss}, Oliver and Li{\ss}ner, Nils and W{\"o}lert, Hendrik and Lincke, Jens and Beckmann, Tom and Hirschfeld, Robert},
  title       = {Implementing a crowd-sourced picture archive for {Bad Harzburg}},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {149},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-545-3},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-56029},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-560291},
  institution = {Universit{\"a}t Potsdam},
  pages       = {x, 191},
  year        = {2022},
  abstract    = {Pictures are a medium that helps make the past tangible and preserve memories. Without context, they are not able to do so. Pictures are brought to life by their associated stories. However, the older pictures become, the fewer contemporary witnesses can tell these stories. Especially for large, analog picture archives, knowledge and memories are spread over many people. This creates several challenges: First, the pictures must be digitized to save them from decaying and make them available to the public. Since a simple listing of all the pictures is confusing, the pictures should be structured accessibly. Second, known information that makes the stories vivid needs to be added to the pictures. Users should get the opportunity to contribute their knowledge and memories. To make this usable for all interested parties, even for older, less technophile generations, the interface should be intuitive and error-tolerant. The resulting requirements are not covered in their entirety by any existing software solution without losing the intuitive interface or the scalability of the system.
Therefore, we have developed our digital picture archive within the scope of a bachelor project in cooperation with the Bad Harzburg-Stiftung. For the implementation of this web application, we use the UI framework React in the frontend, which communicates via a GraphQL interface with the Content Management System Strapi in the backend. The use of this system enables our project partner to create an efficient process from scanning analog pictures to presenting them to visitors in an organized and annotated way. To customize the solution for both picture delivery and information contribution for our target group, we designed prototypes and evaluated them with people from Bad Harzburg. This helped us gain valuable insights into our system's usability and future challenges as well as requirements. Our web application is already being used daily by our project partner. During the project, we still came up with numerous ideas for additional features to further support the exchange of knowledge.},
  language    = {en}
}

@book{SchneiderMaximovaGiese2022,
  author      = {Schneider, Sven and Maximova, Maria and Giese, Holger},
  title       = {Invariant Analysis for Multi-Agent Graph Transformation Systems using k-Induction},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {143},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-531-6},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-54585},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-545851},
  institution = {Universit{\"a}t Potsdam},
  pages       = {37},
  year        = {2022},
  abstract    = {The analysis of behavioral models such as Graph Transformation Systems (GTSs) is of central importance in model-driven engineering. However, GTSs often result in intractably large or even infinite state spaces and may be equipped with multiple or even infinitely many start graphs. To mitigate these problems, static analysis techniques based on finite symbolic representations of sets of states or paths thereof have been devised.
We focus on the technique of k-induction for establishing invariants specified using graph conditions. To this end, k-induction generates symbolic paths backwards from a symbolic state representing a violation of a candidate invariant to gather information on how that violation could have been reached possibly obtaining contradictions to assumed invariants. However, GTSs where multiple agents regularly perform actions independently from each other cannot be analyzed using this technique as of now as the independence among backward steps may prevent the gathering of relevant knowledge altogether. In this paper, we extend k-induction to GTSs with multiple agents thereby supporting a wide range of additional GTSs. As a running example, we consider an unbounded number of shuttles driving on a large-scale track topology, which adjust their velocity to speed limits to avoid derailing. As central contribution, we develop pruning techniques based on causality and independence among backward steps and verify that k-induction remains sound under this adaptation as well as terminates in cases where it did not terminate before.},
  language    = {en}
}

% Key carries an "a" suffix: the preceding entry (report no. 143, same authors
% and year) already uses SchneiderMaximovaGiese2022, and citation keys must be unique.
@book{SchneiderMaximovaGiese2022a,
  author      = {Schneider, Sven and Maximova, Maria and Giese, Holger},
  title       = {Probabilistic metric temporal graph logic},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {146},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-532-3},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-54586},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-545867},
  institution = {Universit{\"a}t Potsdam},
  pages       = {34},
  year        = {2022},
  abstract    = {Cyber-physical systems often encompass complex concurrent behavior with timing constraints and probabilistic failures on demand. The analysis whether such systems with probabilistic timed behavior adhere to a given specification is essential.
When the states of the system can be represented by graphs, the rule-based formalism of Probabilistic Timed Graph Transformation Systems (PTGTSs) can be used to suitably capture structure dynamics as well as probabilistic and timed behavior of the system. The model checking support for PTGTSs w.r.t. properties specified using Probabilistic Timed Computation Tree Logic (PTCTL) has been already presented. Moreover, for timed graph-based runtime monitoring, Metric Temporal Graph Logic (MTGL) has been developed for stating metric temporal properties on identified subgraphs and their structural changes over time. In this paper, we (a) extend MTGL to the Probabilistic Metric Temporal Graph Logic (PMTGL) by allowing for the specification of probabilistic properties, (b) adapt our MTGL satisfaction checking approach to PTGTSs, and (c) combine the approaches for PTCTL model checking and MTGL satisfaction checking to obtain a Bounded Model Checking (BMC) approach for PMTGL. In our evaluation, we apply an implementation of our BMC approach in AutoGraph to a running example.},
  language    = {en}
}

@book{KlinkeVerhoevenRothetal.2022,
  author      = {Klinke, Paula and Verhoeven, Silvan and Roth, Felix and Hagemann, Linus and Alnawa, Tarik and Lincke, Jens and Rein, Patrick and Hirschfeld, Robert},
  title       = {Tool support for collaborative creation of interactive storytelling media},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {141},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-521-7},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-51857},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-518570},
  institution = {Universit{\"a}t Potsdam},
  pages       = {x, 167},
  year        = {2022},
  abstract    = {Scrollytellings are an innovative form of web content. Combining the benefits of books, images, movies, and video games, they are a tool to tell compelling stories and provide excellent learning opportunities. Due to their multi-modality, creating high-quality scrollytellings is not an easy task.
Different professions, such as content designers, graphics designers, and developers, need to collaborate to get the best out of the possibilities the scrollytelling format provides. Collaboration unlocks great potential. However, content designers cannot create scrollytellings directly and always need to consult with developers to implement their vision. This can result in misunderstandings. Often, the resulting scrollytelling will not match the designer's vision sufficiently, causing unnecessary iterations. Our project partner Typeshift specializes in the creation of individualized scrollytellings for their clients. Examined existing solutions for authoring interactive content are not optimally suited for creating highly customized scrollytellings while still being able to manipulate all their elements programmatically. Based on their experience and expertise, we developed an editor to author scrollytellings in the lively.next live-programming environment. In this environment, a graphical user interface for content design is combined with powerful possibilities for programming behavior with the morphic system. The editor allows content designers to take on large parts of the creation process of scrollytellings on their own, such as creating the visible elements, animating content, and fine-tuning the scrollytelling. Hence, developers can focus on interactive elements such as simulations and games. Together with Typeshift, we evaluated the tool by recreating an existing scrollytelling and identified possible future enhancements. Our editor streamlines the creation process of scrollytellings. Content designers and developers can now both work on the same scrollytelling. Due to the editor inside of the lively.next environment, they can both work with a set of tools familiar to them and their traits. Thus, we mitigate unnecessary iterations and misunderstandings by enabling content designers to realize large parts of their vision of a scrollytelling on their own. 
Developers can add advanced and individual behavior. Thus, developers and content designers benefit from a clearer distribution of tasks while keeping the benefits of collaboration.},
  language    = {en}
}

@book{DuerschReinMattisetal.2022,
  author      = {D{\"u}rsch, Falco and Rein, Patrick and Mattis, Toni and Hirschfeld, Robert},
  title       = {Learning from failure},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {145},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-528-6},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-53755},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-537554},
  institution = {Universit{\"a}t Potsdam},
  pages       = {87},
  year        = {2022},
  abstract    = {Regression testing is a widespread practice in today's software industry to ensure software product quality. Developers derive a set of test cases, and execute them frequently to ensure that their change did not adversely affect existing functionality. As the software product and its test suite grow, the time to feedback during regression test sessions increases, and impedes programmer productivity: developers wait longer for tests to complete, and delays in fault detection render fault removal increasingly difficult. Test case prioritization addresses the problem of long feedback loops by reordering test cases, such that test cases of high failure probability run first, and test case failures become actionable early in the testing process. We ask, given test execution schedules reconstructed from publicly available data, to which extent can their fault detection efficiency be improved, and which technique yields the most efficient test schedules with respect to APFD? To this end, we recover 6200 regression test sessions from the build log files of Travis CI, a popular continuous integration service, and gather 62000 accompanying changelists. We evaluate the efficiency of current test schedules, and examine the prioritization results of state-of-the-art lightweight, history-based heuristics.
We propose and evaluate a novel set of prioritization algorithms, which connect software changes and test failures in a matrix-like data structure. Our studies indicate that the optimization potential is substantial, because the existing test plans score only 30\% APFD. The predictive power of past test failures proves to be outstanding: simple heuristics, such as repeating tests with failures in recent sessions, result in efficiency scores of 95\% APFD. The best-performing matrix-based heuristic achieves a similar score of 92.5\% APFD. In contrast to prior approaches, we argue that matrix-based techniques are useful beyond the scope of effective prioritization, and enable a number of use cases involving software maintenance. We validate our findings from continuous integration processes by extending a continuous testing tool within development environments with means of test prioritization, and pose further research questions. We think that our findings are suited to propel adoption of (continuous) testing practices, and that programmers' toolboxes should contain test prioritization as an existential productivity tool.},
  language  = {en}
}

@article{BlaesiusFriedrichLischeidetal.2022,
  author    = {Bl{\"a}sius, Thomas and Friedrich, Tobias and Lischeid, Julius and Meeks, Kitty and Schirneck, Friedrich Martin},
  title     = {Efficiently enumerating hitting sets of hypergraphs arising in data profiling},
  journal   = {Journal of computer and system sciences : JCSS},
  volume    = {124},
  publisher = {Elsevier},
  address   = {San Diego},
  issn      = {0022-0000},
  doi       = {10.1016/j.jcss.2021.10.002},
  pages     = {192--213},
  year      = {2022},
  abstract  = {The transversal hypergraph problem asks to enumerate the minimal hitting sets of a hypergraph. If the solutions have bounded size, Eiter and Gottlob [SICOMP'95] gave an algorithm running in output-polynomial time, but whose space requirement also scales with the output.
We improve this to polynomial delay and space. Central to our approach is the extension problem, deciding for a set X of vertices whether it is contained in any minimal hitting set. We show that this is one of the first natural problems to be W[3]-complete. We give an algorithm for the extension problem running in time $O(m^{|X|+1} n)$ and prove a SETH-lower bound showing that this is close to optimal. We apply our enumeration method to the discovery problem of minimal unique column combinations from data profiling. Our empirical evaluation suggests that the algorithm outperforms its worst-case guarantees on hypergraphs stemming from real-world databases.},
  language  = {en}
}

@phdthesis{Hesse2022,
  author   = {Hesse, G{\"u}nter},
  title    = {A benchmark for enterprise stream processing architectures},
  doi      = {10.25932/publishup-56600},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-566000},
  school   = {Universit{\"a}t Potsdam},
  pages    = {ix, 148},
  year     = {2022},
  abstract = {Data stream processing systems (DSPSs) are a key enabler to integrate continuously generated data, such as sensor measurements, into enterprise applications. DSPSs allow to steadily analyze information from data streams, e.g., to monitor manufacturing processes and enable fast reactions to anomalous behavior. Moreover, DSPSs continuously filter, sample, and aggregate incoming streams of data, which reduces the data size, and thus data storage costs. The growing volumes of generated data have increased the demand for high-performance DSPSs, leading to a higher interest in these systems and to the development of new DSPSs. While having more DSPSs is favorable for users as it allows choosing the system that satisfies their requirements the most, it also introduces the challenge of identifying the most suitable DSPS regarding current needs as well as future demands.
Having a solution to this challenge is important because replacements of DSPSs require the costly re-writing of applications if no abstraction layer is used for application development. However, quantifying performance differences between DSPSs is a difficult task. Existing benchmarks fail to integrate all core functionalities of DSPSs and lack tool support, which hinders objective result comparisons. Moreover, no current benchmark covers the combination of streaming data with existing structured business data, which is particularly relevant for companies. This thesis proposes a performance benchmark for enterprise stream processing called ESPBench. With enterprise stream processing, we refer to the combination of streaming and structured business data. Our benchmark design represents real-world scenarios and allows for an objective result comparison as well as scaling of data. The defined benchmark query set covers all core functionalities of DSPSs. The benchmark toolkit automates the entire benchmark process and provides important features, such as query result validation and a configurable data ingestion rate. To validate ESPBench and to ease the use of the benchmark, we propose an example implementation of the ESPBench queries leveraging the Apache Beam software development kit (SDK). The Apache Beam SDK is an abstraction layer designed for developing stream processing applications that is applied in academia as well as enterprise contexts. It allows to run the defined applications on any of the supported DSPSs. The performance impact of Apache Beam is studied in this dissertation as well. The results show that there is a significant influence that differs among DSPSs and stream processing applications. For validating ESPBench, we use the example implementation of the ESPBench queries developed using the Apache Beam SDK. We benchmark the implemented queries executed on three modern DSPSs: Apache Flink, Apache Spark Streaming, and Hazelcast Jet. 
The results of the study prove the functioning of ESPBench and its toolkit. ESPBench is capable of quantifying performance characteristics of DSPSs and of unveiling differences among systems. The benchmark proposed in this thesis covers all requirements to be applied in enterprise stream processing settings, and thus represents an improvement over the current state-of-the-art.}, language = {en} } @article{HiortSchlaffnerSteenetal.2022, author = {Hiort, Pauline and Schlaffner, Christoph N. and Steen, Judith A. and Renard, Bernhard Y. and Steen, Hanno}, title = {multiFLEX-LF: a computational approach to quantify the modification stoichiometries in label-free proteomics data sets}, series = {Journal of proteome research}, volume = {21}, journal = {Journal of proteome research}, number = {4}, publisher = {American Chemical Society}, address = {Washington}, issn = {1535-3893}, doi = {10.1021/acs.jproteome.1c00669}, pages = {899 -- 909}, year = {2022}, abstract = {In liquid-chromatography-tandem-mass-spectrometry-based proteomics, information about the presence and stoichiometry ofprotein modifications is not readily available. To overcome this problem,we developed multiFLEX-LF, a computational tool that builds uponFLEXIQuant, which detects modified peptide precursors and quantifiestheir modification extent by monitoring the differences between observedand expected intensities of the unmodified precursors. multiFLEX-LFrelies on robust linear regression to calculate the modification extent of agiven precursor relative to a within-study reference. multiFLEX-LF cananalyze entire label-free discovery proteomics data sets in a precursor-centric manner without preselecting a protein of interest. To analyzemodification dynamics and coregulated modifications, we hierarchicallyclustered the precursors of all proteins based on their computed relativemodification scores. 
We applied multiFLEX-LF to a data-independent-acquisition-based data set acquired using the anaphase-promoting complex/cyclosome (APC/C) isolated at various time pointsduring mitosis. The clustering of the precursors allows for identifying varying modification dynamics and ordering the modificationevents. Overall, multiFLEX-LF enables the fast identification of potentially differentially modified peptide precursors and thequantification of their differential modification extent in large data sets using a personal computer. Additionally, multiFLEX-LF candrive the large-scale investigation of the modification dynamics of peptide precursors in time-series and case-control studies.multiFLEX-LF is available athttps://gitlab.com/SteenOmicsLab/multiflex-lf.}, language = {en} }