@article{Steinroetter2021, author = {Steinr{\"o}tter, Bj{\"o}rn}, title = {Das Konzept einer datenaltruistischen Organisation}, series = {Datenschutz und Datensicherheit}, volume = {45}, journal = {Datenschutz und Datensicherheit}, number = {12}, publisher = {Springer}, address = {Berlin}, issn = {1862-2607}, doi = {10.1007/s11623-021-1539-6}, pages = {794 -- 798}, year = {2021}, abstract = {Dass Technologien wie Machine Learning-Anwendungen oder Big bzw. Smart Data- Verfahren unbedingt Daten in ausreichender Menge und G{\"u}te ben{\"o}tigen, erscheint inzwischen als Binsenweisheit. Vor diesem Hintergrund hat insbesondere der EU-Gesetzgeber f{\"u}r sich zuletzt ein neues Bet{\"a}tigungsfeld entdeckt, indem er versucht, auf unterschiedlichen Wegen Anreize zum Datenteilen zu schaffen, um Innovation zu kreieren. Hierzu z{\"a}hlt auch eine geradezu wohlt{\"o}nend mit ,,Datenaltruismus'' verschlagwortete Konstellation. Der Beitrag stellt die diesbez{\"u}glichen Regulierungserw{\"a}gungen auf supranationaler Ebene dar und nimmt eine erste Analyse vor.}, language = {de} } @article{SchindlerMoldenhawerStangeetal.2021, author = {Schindler, Daniel and Moldenhawer, Ted and Stange, Maike and Lepro, Valentino and Beta, Carsten and Holschneider, Matthias and Huisinga, Wilhelm}, title = {Analysis of protrusion dynamics in amoeboid cell motility by means of regularized contour flows}, series = {PLoS Computational Biology : a new community journal}, volume = {17}, journal = {PLoS Computational Biology : a new community journal}, number = {8}, publisher = {PLoS}, address = {San Fransisco}, issn = {1553-734X}, doi = {10.1371/journal.pcbi.1009268}, pages = {33}, year = {2021}, abstract = {Amoeboid cell motility is essential for a wide range of biological processes including wound healing, embryonic morphogenesis, and cancer metastasis. It relies on complex dynamical patterns of cell shape changes that pose long-standing challenges to mathematical modeling and raise a need for automated and reproducible approaches to extract quantitative morphological features from image sequences. Here, we introduce a theoretical framework and a computational method for obtaining smooth representations of the spatiotemporal contour dynamics from stacks of segmented microscopy images. Based on a Gaussian process regression we propose a one-parameter family of regularized contour flows that allows us to continuously track reference points (virtual markers) between successive cell contours. We use this approach to define a coordinate system on the moving cell boundary and to represent different local geometric quantities in this frame of reference. In particular, we introduce the local marker dispersion as a measure to identify localized membrane expansions and provide a fully automated way to extract the properties of such expansions, including their area and growth time. The methods are available as an open-source software package called AmoePy, a Python-based toolbox for analyzing amoeboid cell motility (based on time-lapse microscopy data), including a graphical user interface and detailed documentation. Due to the mathematical rigor of our framework, we envision it to be of use for the development of novel cell motility models. We mainly use experimental data of the social amoeba Dictyostelium discoideum to illustrate and validate our approach.
Author summary Amoeboid motion is a crawling-like cell migration that plays an important key role in multiple biological processes such as wound healing and cancer metastasis. This type of cell motility results from expanding and simultaneously contracting parts of the cell membrane. From fluorescence images, we obtain a sequence of points, representing the cell membrane, for each time step. By using regression analysis on these sequences, we derive smooth representations, so-called contours, of the membrane. Since the number of measurements is discrete and often limited, the question is raised of how to link consecutive contours with each other. In this work, we present a novel mathematical framework in which these links are described by regularized flows allowing a certain degree of concentration or stretching of neighboring reference points on the same contour. This stretching rate, the so-called local dispersion, is used to identify expansions and contractions of the cell membrane providing a fully automated way of extracting properties of these cell shape changes. We applied our methods to time-lapse microscopy data of the social amoeba Dictyostelium discoideum.}, language = {en} } @article{TavakoliAlirezazadehHedayatipouretal.2021, author = {Tavakoli, Hamad and Alirezazadeh, Pendar and Hedayatipour, Ava and Nasib, A. H. Banijamali and Landwehr, Niels}, title = {Leaf image-based classification of some common bean cultivars using discriminative convolutional neural networks}, series = {Computers and electronics in agriculture : COMPAG online ; an international journal}, volume = {181}, journal = {Computers and electronics in agriculture : COMPAG online ; an international journal}, publisher = {Elsevier}, address = {Amsterdam [u.a.]}, issn = {0168-1699}, doi = {10.1016/j.compag.2020.105935}, pages = {11}, year = {2021}, abstract = {In recent years, many efforts have been made to apply image processing techniques for plant leaf identification. However, categorizing leaf images at the cultivar/variety level, because of the very low inter-class variability, is still a challenging task. In this research, we propose an automatic discriminative method based on convolutional neural networks (CNNs) for classifying 12 different cultivars of common beans that belong to three various species. We show that employing advanced loss functions, such as Additive Angular Margin Loss and Large Margin Cosine Loss, instead of the standard softmax loss function for the classification can yield better discrimination between classes and thereby mitigate the problem of low inter-class variability. The method was evaluated by classifying species (level I), cultivars from the same species (level II), and cultivars from different species (level III), based on images from the leaf foreside and backside. The results indicate that the performance of the classification algorithm on the leaf backside image dataset is superior. The maximum mean classification accuracies of 95.86, 91.37 and 86.87\% were obtained at the levels I, II and III, respectively. The proposed method outperforms the previous relevant works and provides a reliable approach for plant cultivars identification.}, language = {en} } @article{PfitznerSteckhanArnrich2021, author = {Pfitzner, Bjarne and Steckhan, Nico and Arnrich, Bert}, title = {Federated learning in a medical context}, series = {ACM transactions on internet technology : TOIT / Association for Computing}, volume = {21}, journal = {ACM transactions on internet technology : TOIT / Association for Computing}, number = {2}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1533-5399}, doi = {10.1145/3412357}, pages = {1 -- 31}, year = {2021}, abstract = {Data privacy is a very important issue. Especially in fields like medicine, it is paramount to abide by the existing privacy regulations to preserve patients' anonymity. However, data is required for research and training machine learning models that could help gain insight into complex correlations or personalised treatments that may otherwise stay undiscovered. Those models generally scale with the amount of data available, but the current situation often prohibits building large databases across sites. So it would be beneficial to be able to combine similar or related data from different sites all over the world while still preserving data privacy. Federated learning has been proposed as a solution for this, because it relies on the sharing of machine learning models, instead of the raw data itself. That means private data never leaves the site or device it was collected on. Federated learning is an emerging research area, and many domains have been identified for the application of those methods. This systematic literature review provides an extensive look at the concept of and research into federated learning and its applicability for confidential healthcare datasets.}, language = {en} } @article{BonnetDongNaumannetal.2021, author = {Bonnet, Philippe and Dong, Xin Luna and Naumann, Felix and T{\"o}z{\"u}n, P{\i}nar}, title = {VLDB 2021}, series = {SIGMOD record}, volume = {50}, journal = {SIGMOD record}, number = {4}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0163-5808}, doi = {10.1145/3516431.3516447}, pages = {50 -- 53}, year = {2021}, abstract = {The 47th International Conference on Very Large Databases (VLDB'21) was held on August 16-20, 2021 as a hybrid conference. It attracted 180 in-person attendees in Copenhagen and 840 remote attendees. In this paper, we describe our key decisions as general chairs and program committee chairs and share the lessons we learned.}, language = {en} } @article{CabalarFandinoFarinasdelCerro2021, author = {Cabalar, Pedro and Fandi{\~n}o, Jorge and Fari{\~n}as del Cerro, Luis}, title = {Splitting epistemic logic programs}, series = {Theory and practice of logic programming / publ. for the Association for Logic Programming}, volume = {21}, journal = {Theory and practice of logic programming / publ. for the Association for Logic Programming}, number = {3}, publisher = {Cambridge Univ. Press}, address = {Cambridge [u.a.]}, issn = {1471-0684}, doi = {10.1017/S1471068420000058}, pages = {296 -- 316}, year = {2021}, abstract = {Epistemic logic programs constitute an extension of the stable model semantics to deal with new constructs called subjective literals. Informally speaking, a subjective literal allows checking whether some objective literal is true in all or some stable models. As it can be imagined, the associated semantics has proved to be non-trivial, since the truth of subjective literals may interfere with the set of stable models it is supposed to query. As a consequence, no clear agreement has been reached and different semantic proposals have been made in the literature. Unfortunately, comparison among these proposals has been limited to a study of their effect on individual examples, rather than identifying general properties to be checked. In this paper, we propose an extension of the well-known splitting property for logic programs to the epistemic case. We formally define when an arbitrary semantics satisfies the epistemic splitting property and examine some of the consequences that can be derived from that, including its relation to conformant planning and to epistemic constraints. Interestingly, we prove (through counterexamples) that most of the existing approaches fail to fulfill the epistemic splitting property, except the original semantics proposed by Gelfond 1991 and a recent proposal by the authors, called Founded Autoepistemic Equilibrium Logic.}, language = {en} } @article{GoebelLagodzinskiSeidel2021, author = {G{\"o}bel, Andreas and Lagodzinski, Julius Albert Gregor and Seidel, Karen}, title = {Counting homomorphisms to trees modulo a prime}, series = {ACM transactions on computation theory : TOCT / Association for Computing Machinery}, volume = {13}, journal = {ACM transactions on computation theory : TOCT / Association for Computing Machinery}, number = {3}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1942-3454}, doi = {10.1145/3460958}, pages = {1 -- 33}, year = {2021}, abstract = {Many important graph-theoretic notions can be encoded as counting graph homomorphism problems, such as partition functions in statistical physics, in particular independent sets and colourings. In this article, we study the complexity of \#(p) HOMSTOH, the problem of counting graph homomorphisms from an input graph to a graph H modulo a prime number p. Dyer and Greenhill proved a dichotomy stating that the tractability of non-modular counting graph homomorphisms depends on the structure of the target graph. Many intractable cases in non-modular counting become tractable in modular counting due to the common phenomenon of cancellation. In subsequent studies on counting modulo 2, however, the influence of the structure of H on the tractability was shown to persist, which yields similar dichotomies.
Our main result states that for every tree H and every prime p the problem \#pHOMSTOH is either polynomial time computable or \#P-p-complete. This relates to the conjecture of Faben and Jerrum stating that this dichotomy holds for every graph H when counting modulo 2. In contrast to previous results on modular counting, the tractable cases of \#pHOMSTOH are essentially the same for all values of the modulo when H is a tree. To prove this result, we study the structural properties of a homomorphism. As an important interim result, our study yields a dichotomy for the problem of counting weighted independent sets in a bipartite graph modulo some prime p. These results are the first suggesting that such dichotomies hold not only for the modulo 2 case but also for the modular counting functions of all primes p.}, language = {en} } @article{NguyenGeorgieKayhanetal.2021, author = {Nguyen, Dong Hai Phuong and Georgie, Yasmin Kim and Kayhan, Ezgi and Eppe, Manfred and Hafner, Verena Vanessa and Wermter, Stefan}, title = {Sensorimotor representation learning for an "active self" in robots}, series = {K{\"u}nstliche Intelligenz : KI ; Forschung, Entwicklung, Erfahrungen ; Organ des Fachbereichs 1 K{\"u}nstliche Intelligenz der Gesellschaft f{\"u}r Informatik e.V., GI / Fachbereich 1 der Gesellschaft f{\"u}r Informatik e.V}, volume = {35}, journal = {K{\"u}nstliche Intelligenz : KI ; Forschung, Entwicklung, Erfahrungen ; Organ des Fachbereichs 1 K{\"u}nstliche Intelligenz der Gesellschaft f{\"u}r Informatik e.V., GI / Fachbereich 1 der Gesellschaft f{\"u}r Informatik e.V}, number = {1}, publisher = {Springer}, address = {Berlin}, issn = {0933-1875}, doi = {10.1007/s13218-021-00703-z}, pages = {9 -- 35}, year = {2021}, abstract = {Safe human-robot interactions require robots to be able to learn how to behave appropriately in spaces populated by people and thus to cope with the challenges posed by our dynamic and unstructured environment, rather than being provided a rigid set of rules for operations. In humans, these capabilities are thought to be related to our ability to perceive our body in space, sensing the location of our limbs during movement, being aware of other objects and agents, and controlling our body parts to interact with them intentionally. Toward the next generation of robots with bio-inspired capacities, in this paper, we first review the developmental processes of underlying mechanisms of these abilities: The sensory representations of body schema, peripersonal space, and the active self in humans. Second, we provide a survey of robotics models of these sensory representations and robotics models of the self; and we compare these models with the human counterparts. Finally, we analyze what is missing from these robotics models and propose a theoretical computational framework, which aims to allow the emergence of the sense of self in artificial agents by developing sensory representations through self-exploration.}, language = {en} } @article{OmranianAngeleskaNikoloski2021, author = {Omranian, Sara and Angeleska, Angela and Nikoloski, Zoran}, title = {PC2P}, series = {Bioinformatics}, volume = {37}, journal = {Bioinformatics}, number = {1}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btaa1089}, pages = {73 -- 81}, year = {2021}, abstract = {Motivation: Prediction of protein complexes from protein-protein interaction (PPI) networks is an important problem in systems biology, as they control different cellular functions. The existing solutions employ algorithms for network community detection that identify dense subgraphs in PPI networks. However, gold standards in yeast and human indicate that protein complexes can also induce sparse subgraphs, introducing further challenges in protein complex prediction. Results: To address this issue, we formalize protein complexes as biclique spanned subgraphs, which include both sparse and dense subgraphs. We then cast the problem of protein complex prediction as a network partitioning into biclique spanned subgraphs with removal of minimum number of edges, called coherent partition. Since finding a coherent partition is a computationally intractable problem, we devise a parameter-free greedy approximation algorithm, termed Protein Complexes from Coherent Partition (PC2P), based on key properties of biclique spanned subgraphs. Through comparison with nine contenders, we demonstrate that PC2P: (i) successfully identifies modular structure in networks, as a prerequisite for protein complex prediction, (ii) outperforms the existing solutions with respect to a composite score of five performance measures on 75\% and 100\% of the analyzed PPI networks and gold standards in yeast and human, respectively, and (iii,iv) does not compromise GO semantic similarity and enrichment score of the predicted protein complexes. Therefore, our study demonstrates that clustering of networks in terms of biclique spanned subgraphs is a promising framework for detection of complexes in PPI networks.}, language = {en} } @article{TrautmannZhouBrahmsetal.2021, author = {Trautmann, Justin and Zhou, Lin and Brahms, Clemens Markus and Tunca, Can and Ersoy, Cem and Granacher, Urs and Arnrich, Bert}, title = {TRIPOD}, series = {Data : open access ʻData in scienceʼ journal}, volume = {6}, journal = {Data : open access ʻData in scienceʼ journal}, number = {9}, publisher = {MDPI}, address = {Basel}, issn = {2306-5729}, doi = {10.3390/data6090095}, pages = {19}, year = {2021}, abstract = {Inertial measurement units (IMUs) enable easy to operate and low-cost data recording for gait analysis. When combined with treadmill walking, a large number of steps can be collected in a controlled environment without the need of a dedicated gait analysis laboratory. In order to evaluate existing and novel IMU-based gait analysis algorithms for treadmill walking, a reference dataset that includes IMU data as well as reliable ground truth measurements for multiple participants and walking speeds is needed. This article provides a reference dataset consisting of 15 healthy young adults who walked on a treadmill at three different speeds. Data were acquired using seven IMUs placed on the lower body, two different reference systems (Zebris FDMT-HQ and OptoGait), and two RGB cameras. Additionally, in order to validate an existing IMU-based gait analysis algorithm using the dataset, an adaptable modular data analysis pipeline was built. Our results show agreement between the pressure-sensitive Zebris and the photoelectric OptoGait system (r = 0.99), demonstrating the quality of our reference data. As a use case, the performance of an algorithm originally designed for overground walking was tested on treadmill data using the data pipeline. The accuracy of stride length and stride time estimations was comparable to that reported in other studies with overground data, indicating that the algorithm is equally applicable to treadmill data. The Python source code of the data pipeline is publicly available, and the dataset will be provided by the authors upon request, enabling future evaluations of IMU gait analysis algorithms without the need of recording new data.}, language = {en} } @article{DeFreitasJohnsonGoldenetal.2021, author = {De Freitas, Jessica K. and Johnson, Kipp W. and Golden, Eddye and Nadkarni, Girish N. and Dudley, Joel T. and B{\"o}ttinger, Erwin and Glicksberg, Benjamin S. and Miotto, Riccardo}, title = {Phe2vec}, series = {Patterns}, volume = {2}, journal = {Patterns}, number = {9}, publisher = {Elsevier}, address = {Amsterdam}, issn = {2666-3899}, doi = {10.1016/j.patter.2021.100337}, pages = {9}, year = {2021}, abstract = {Robust phenotyping of patients from electronic health records (EHRs) at scale is a challenge in clinical informatics. Here, we introduce Phe2vec, an automated framework for disease phenotyping from EHRs based on unsupervised learning and assess its effectiveness against standard rule-based algorithms from Phenotype KnowledgeBase (PheKB). Phe2vec is based on pre-computing embeddings of medical concepts and patients' clinical history. Disease phenotypes are then derived from a seed concept and its neighbors in the embedding space. Patients are linked to a disease if their embedded representation is close to the disease phenotype. Comparing Phe2vec and PheKB cohorts head-to-head using chart review, Phe2vec performed on par or better in nine out of ten diseases. Differently from other approaches, it can scale to any condition and was validated against widely adopted expert-based standards. Phe2vec aims to optimize clinical informatics research by augmenting current frameworks to characterize patients by condition and derive reliable disease cohorts.}, language = {en} } @article{FreitasdaCruzPfahringerMartensenetal.2021, author = {Freitas da Cruz, Harry and Pfahringer, Boris and Martensen, Tom and Schneider, Frederic and Meyer, Alexander and B{\"o}ttinger, Erwin and Schapranow, Matthieu-Patrick}, title = {Using interpretability approaches to update "black-box" clinical prediction models}, series = {Artificial intelligence in medicine : AIM}, volume = {111}, journal = {Artificial intelligence in medicine : AIM}, publisher = {Elsevier}, address = {Amsterdam}, issn = {0933-3657}, doi = {10.1016/j.artmed.2020.101982}, pages = {13}, year = {2021}, abstract = {Despite advances in machine learning-based clinical prediction models, only few of such models are actually deployed in clinical contexts. Among other reasons, this is due to a lack of validation studies. In this paper, we present and discuss the validation results of a machine learning model for the prediction of acute kidney injury in cardiac surgery patients initially developed on the MIMIC-III dataset when applied to an external cohort of an American research hospital. To help account for the performance differences observed, we utilized interpretability methods based on feature importance, which allowed experts to scrutinize model behavior both at the global and local level, making it possible to gain further insights into why it did not behave as expected on the validation cohort. The knowledge gleaned upon derivation can be potentially useful to assist model update during validation for more generalizable and simpler models. We argue that interpretability methods should be considered by practitioners as a further tool to help explain performance differences and inform model update in validation studies.}, language = {en} } @article{BorchertMockTomczaketal.2021, author = {Borchert, Florian and Mock, Andreas and Tomczak, Aurelie and H{\"u}gel, Jonas and Alkarkoukly, Samer and Knurr, Alexander and Volckmar, Anna-Lena and Stenzinger, Albrecht and Schirmacher, Peter and Debus, J{\"u}rgen and J{\"a}ger, Dirk and Longerich, Thomas and Fr{\"o}hling, Stefan and Eils, Roland and Bougatf, Nina and Sax, Ulrich and Schapranow, Matthieu-Patrick}, title = {Knowledge bases and software support for variant interpretation in precision oncology}, series = {Briefings in bioinformatics}, volume = {22}, journal = {Briefings in bioinformatics}, number = {6}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1467-5463}, doi = {10.1093/bib/bbab134}, pages = {17}, year = {2021}, abstract = {Precision oncology is a rapidly evolving interdisciplinary medical specialty. Comprehensive cancer panels are becoming increasingly available at pathology departments worldwide, creating the urgent need for scalable cancer variant annotation and molecularly informed treatment recommendations. A wealth of mainly academia-driven knowledge bases calls for software tools supporting the multi-step diagnostic process. We derive a comprehensive list of knowledge bases relevant for variant interpretation by a review of existing literature followed by a survey among medical experts from university hospitals in Germany. In addition, we review cancer variant interpretation tools, which integrate multiple knowledge bases. We categorize the knowledge bases along the diagnostic process in precision oncology and analyze programmatic access options as well as the integration of knowledge bases into software tools. The most commonly used knowledge bases provide good programmatic access options and have been integrated into a range of software tools. For the wider set of knowledge bases, access options vary across different parts of the diagnostic process. Programmatic access is limited for information regarding clinical classifications of variants and for therapy recommendations. The main issue for databases used for biological classification of pathogenic variants and pathway context information is the lack of standardized interfaces. There is no single cancer variant interpretation tool that integrates all identified knowledge bases. Specialized tools are available and need to be further developed for different steps in the diagnostic process.}, language = {en} } @inproceedings{GundlachAbramova2021, author = {Gundlach, Jana and Abramova, Olga}, title = {Newsfeed clutter as an inhibitor of sensemaking}, series = {AMCIS Proceedings 2021}, booktitle = {AMCIS Proceedings 2021}, publisher = {AIS}, address = {Atlanta}, isbn = {978-1-7336325-8-4}, pages = {10}, year = {2021}, abstract = {As a central functionality of SNSs, the newsfeed is responsible for the way, how content is presented. This paper investigates the implications of current content presentation on Facebook, which has appeared to be a matter of users' criticism. Leaning on the communication theory, we conceptualize clutter on a newsfeed as noise that hinders the receiver's adequate message decoding (i.e., sensemaking). We further operationalize newsfeed clutter via perceived disorder, information overload, and system feature overload. Our participants browsed their Facebook newsfeed for at least 5 minutes. The follow-up survey results provide partial support for our hypotheses, with only perceived disorder significantly associated with lower sensemaking. These findings shed new light on user experience and underpin the importance of SNSs as communication systems, adding to the existent literature on the dark sides of social media.}, language = {en} } @inproceedings{Brinkmann2021, author = {Brinkmann, Maik}, title = {Relevance of public administrations}, series = {Proceedings of the 54th Hawaii International Conference on System Sciences 2021}, booktitle = {Proceedings of the 54th Hawaii International Conference on System Sciences 2021}, publisher = {University of Hawaiʻi at Mānoa}, address = {Honolulu, HI}, isbn = {978-0-9981331-4-0}, doi = {10.24251/HICSS.2021.285}, pages = {10}, year = {2021}, abstract = {Power relations within the area of blockchain governance are complex by definition and a comprehensive analysis that links technological and institutional elements is missing to date. The research that is presented with this article focuses on the visualization of the shifting power relations with the introduction of blockchain. For this purpose, the analysis leverages an adjusted version of the multi-stakeholder influence mapping tool. The analysis considers the various stakeholders within the multi-layered blockchain technology stack and compares three fundamental blockchain scenarios, including public and private blockchain settings. The findings show that public administrations face indeed less power with the introduction of blockchain, while new stakeholders come into play who wield influence rather uncontrolled. Nonetheless, public administrations are not powerless overall and remain influential stakeholders. This paper concludes that blockchain governance is not as democratic as blockchain enthusiasts tend to argue and derives corresponding opportunities for further research.}, language = {en} } @phdthesis{Hecher2021, author = {Hecher, Markus}, title = {Advanced tools and methods for treewidth-based problem solving}, doi = {10.25932/publishup-51251}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-512519}, school = {Universit{\"a}t Potsdam}, pages = {xv, 184}, year = {2021}, abstract = {In the last decades, there was a notable progress in solving the well-known Boolean satisfiability (Sat) problem, which can be witnessed by powerful Sat solvers. One of the reasons why these solvers are so fast are structural properties of instances that are utilized by the solver's interna. This thesis deals with the well-studied structural property treewidth, which measures the closeness of an instance to being a tree. In fact, there are many problems parameterized by treewidth that are solvable in polynomial time in the instance size when parameterized by treewidth. In this work, we study advanced treewidth-based methods and tools for problems in knowledge representation and reasoning (KR). Thereby, we provide means to establish precise runtime results (upper bounds) for canonical problems relevant to KR. Then, we present a new type of problem reduction, which we call decomposition-guided (DG) that allows us to precisely monitor the treewidth when reducing from one problem to another problem. This new reduction type will be the basis for a long-open lower bound result for quantified Boolean formulas and allows us to design a new methodology for establishing runtime lower bounds for problems parameterized by treewidth. Finally, despite these lower bounds, we provide an efficient implementation of algorithms that adhere to treewidth. Our approach finds suitable abstractions of instances, which are subsequently refined in a recursive fashion, and it uses Sat solvers for solving subproblems. It turns out that our resulting solver is quite competitive for two canonical counting problems related to Sat.}, language = {en} } @article{SchneiderWenigPapenbrock2021, author = {Schneider, Johannes and Wenig, Phillip and Papenbrock, Thorsten}, title = {Distributed detection of sequential anomalies in univariate time series}, series = {The VLDB journal : the international journal on very large data bases}, volume = {30}, journal = {The VLDB journal : the international journal on very large data bases}, number = {4}, publisher = {Springer}, address = {Berlin}, issn = {1066-8888}, doi = {10.1007/s00778-021-00657-6}, pages = {579 -- 602}, year = {2021}, abstract = {The automated detection of sequential anomalies in time series is an essential task for many applications, such as the monitoring of technical systems, fraud detection in high-frequency trading, or the early detection of disease symptoms. All these applications require the detection to find all sequential anomalies possibly fast on potentially very large time series. In other words, the detection needs to be effective, efficient and scalable w.r.t. the input size. Series2Graph is an effective solution based on graph embeddings that are robust against re-occurring anomalies and can discover sequential anomalies of arbitrary length and works without training data. Yet, Series2Graph is no t scalable due to its single-threaded approach; it cannot, in particular, process arbitrarily large sequences due to the memory constraints of a single machine. In this paper, we propose our distributed anomaly detection system, short DADS, which is an efficient and scalable adaptation of Series2Graph. Based on the actor programming model, DADS distributes the input time sequence, intermediate state and the computation to all processors of a cluster in a way that minimizes communication costs and synchronization barriers. Our evaluation shows that DADS is orders of magnitude faster than S2G, scales almost linearly with the number of processors in the cluster and can process much larger input sequences due to its scale-out property.}, language = {en} } @article{FandinoLaferriereRomeroetal.2021, author = {Fandi{\~n}o, Jorge and Laferriere, Francois and Romero, Javier and Schaub, Torsten and Son, Tran Cao}, title = {Planning with incomplete information in quantified answer set programming}, series = {Theory and practice of logic programming}, volume = {21}, journal = {Theory and practice of logic programming}, number = {5}, publisher = {Cambridge University Press}, address = {Cambridge}, issn = {1471-0684}, doi = {10.1017/S1471068421000259}, pages = {663 -- 679}, year = {2021}, abstract = {We present a general approach to planning with incomplete information in Answer Set Programming (ASP). More precisely, we consider the problems of conformant and conditional planning with sensing actions and assumptions. We represent planning problems using a simple formalism where logic programs describe the transition function between states, the initial states and the goal states. For solving planning problems, we use Quantified Answer Set Programming (QASP), an extension of ASP with existential and universal quantifiers over atoms that is analogous to Quantified Boolean Formulas (QBFs). We define the language of quantified logic programs and use it to represent the solutions different variants of conformant and conditional planning. On the practical side, we present a translation-based QASP solver that converts quantified logic programs into QBFs and then executes a QBF solver, and we evaluate experimentally the approach on conformant and conditional planning benchmarks.}, language = {en} } @inproceedings{DehnertGleissReiss2021, author = {Dehnert, Maik and Gleiß, Alexander and Reiss, Frederick}, title = {What makes a data-driven business model?}, series = {ECIS Proceedings 2021}, booktitle = {ECIS Proceedings 2021}, publisher = {AIS}, address = {Atlanta}, isbn = {978-1-7336325-6-0}, year = {2021}, abstract = {The usage of data to improve or create business models has become vital for companies in the 21st century. However, to extract value from data it is important to understand the business model. Taxonomies for data-driven business models (DDBM) aim to provide guidance for the development and ideation of new business models relying on data. In IS research, however, different taxonomies have emerged in recent years, partly redundant, partly contradictory. Thus, there is a need to synthesize the common ground of these taxonomies within IS research. Based on 26 IS-related taxonomies and 30 cases, we derive and define 14 generic building blocks of DDBM to develop a consolidated taxonomy that represents the current state-of-the-art. Thus, we integrate existing research on DDBM and provide avenues for further exploration of data-induced potentials for business models as well as for the development and analysis of general or industry-specific DDBM.}, language = {en} } @book{MeinelDoellnerWeskeetal.2021, author = {Meinel, Christoph and D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and Giese, Holger and Baudisch, Patrick and Friedrich, Tobias and B{\"o}ttinger, Erwin and Lippert, Christoph and D{\"o}rr, Christian and Lehmann, Anja and Renard, Bernhard and Rabl, Tilmann and Uebernickel, Falk and Arnrich, Bert and H{\"o}lzle, Katharina}, title = {Proceedings of the HPI Research School on Service-oriented Systems Engineering 2020 Fall Retreat}, number = {138}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-513-2}, issn = {1613-5652}, doi = {10.25932/publishup-50413}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-504132}, publisher = {Universit{\"a}t Potsdam}, pages = {vi, 144}, year = {2021}, abstract = {Design and Implementation of service-oriented architectures imposes a huge number of research questions from the fields of software engineering, system analysis and modeling, adaptability, and application integration. Component orientation and web services are two approaches for design and realization of complex web-based system. Both approaches allow for dynamic application adaptation as well as integration of enterprise application. Service-Oriented Systems Engineering represents a symbiosis of best practices in object-orientation, component-based development, distributed computing, and business process management. It provides integration of business and IT concerns. The annual Ph.D. Retreat of the Research School provides each member the opportunity to present his/her current state of their research and to give an outline of a prospective Ph.D. thesis. Due to the interdisciplinary structure of the research school, this technical report covers a wide range of topics. These include but are not limited to: Human Computer Interaction and Computer Vision as Service; Service-oriented Geovisualization Systems; Algorithm Engineering for Service-oriented Systems; Modeling and Verification of Self-adaptive Service-oriented Systems; Tools and Methods for Software Engineering in Service-oriented Systems; Security Engineering of Service-based IT Systems; Service-oriented Information Systems; Evolutionary Transition of Enterprise Applications to Service Orientation; Operating System Abstractions for Service-oriented Computing; and Services Specification, Composition, and Enactment.}, language = {en} } @article{KayaFreitag2022, author = {Kaya, Adem and Freitag, Melina A.}, title = {Conditioning analysis for discrete Helmholtz problems}, series = {Computers and mathematics with applications : an international journal}, volume = {118}, journal = {Computers and mathematics with applications : an international journal}, publisher = {Elsevier Science}, address = {Amsterdam}, issn = {0898-1221}, doi = {10.1016/j.camwa.2022.05.016}, pages = {171 -- 182}, year = {2022}, abstract = {In this paper, we examine conditioning of the discretization of the Helmholtz problem. Although the discrete Helmholtz problem has been studied from different perspectives, to the best of our knowledge, there is no conditioning analysis for it. We aim to fill this gap in the literature. We propose a novel method in 1D to observe the near-zero eigenvalues of a symmetric indefinite matrix. Standard classification of ill-conditioning based on the matrix condition number is not true for the discrete Helmholtz problem. We relate the ill-conditioning of the discretization of the Helmholtz problem with the condition number of the matrix. We carry out analytical conditioning analysis in 1D and extend our observations to 2D with numerical observations. We examine several discretizations. We find different regions in which the condition number of the problem shows different characteristics. We also explain the general behavior of the solutions in these regions.}, language = {en} } @article{MattisBeckmannReinetal.2022, author = {Mattis, Toni and Beckmann, Tom and Rein, Patrick and Hirschfeld, Robert}, title = {First-class concepts}, series = {Journal of object technology : JOT / ETH Z{\"u}rich, Department of Computer Science}, volume = {21}, journal = {Journal of object technology : JOT / ETH Z{\"u}rich, Department of Computer Science}, number = {2}, publisher = {ETH Z{\"u}rich, Department of Computer Science}, address = {Z{\"u}rich}, issn = {1660-1769}, doi = {10.5381/jot.2022.21.2.a6}, pages = {1 -- 15}, year = {2022}, abstract = {Ideally, programs are partitioned into independently maintainable and understandable modules. As a system grows, its architecture gradually loses the capability to accommodate new concepts in a modular way. While refactoring is expensive and not always possible, and the programming language might lack dedicated primary language constructs to express certain cross-cutting concerns, programmers are still able to explain and delineate convoluted concepts through secondary means: code comments, use of whitespace and arrangement of code, documentation, or communicating tacit knowledge.
Secondary constructs are easy to change and provide high flexibility in communicating cross-cutting concerns and other concepts among programmers. However, such secondary constructs usually have no reified representation that can be explored and manipulated as first-class entities through the programming environment.
In this exploratory work, we discuss novel ways to express a wide range of concepts, including cross-cutting concerns, patterns, and lifecycle artifacts independently of the dominant decomposition imposed by an existing architecture. We propose the representation of concepts as first-class objects inside the programming environment that retain the capability to change as easily as code comments. We explore new tools that allow programmers to view, navigate, and change programs based on conceptual perspectives. In a small case study, we demonstrate how such views can be created and how the programming experience changes from draining programmers' attention by stretching it across multiple modules toward focusing it on cohesively presented concepts. Our designs are geared toward facilitating multiple secondary perspectives on a system to co-exist in symbiosis with the original architecture, hence making it easier to explore, understand, and explain complex contexts and narratives that are hard or impossible to express using primary modularity constructs.}, language = {en} } @article{SchmidlPapenbrock2022, author = {Schmidl, Sebastian and Papenbrock, Thorsten}, title = {Efficient distributed discovery of bidirectional order dependencies}, series = {The VLDB journal}, volume = {31}, journal = {The VLDB journal}, number = {1}, publisher = {Springer}, address = {Berlin ; Heidelberg ; New York}, issn = {1066-8888}, doi = {10.1007/s00778-021-00683-4}, pages = {49 -- 74}, year = {2022}, abstract = {Bidirectional order dependencies (bODs) capture order relationships between lists of attributes in a relational table. They can express that, for example, sorting books by publication date in ascending order also sorts them by age in descending order. The knowledge about order relationships is useful for many data management tasks, such as query optimization, data cleaning, or consistency checking. Because the bODs of a specific dataset are usually not explicitly given, they need to be discovered. The discovery of all minimal bODs (in set-based canonical form) is a task with exponential complexity in the number of attributes, though, which is why existing bOD discovery algorithms cannot process datasets of practically relevant size in a reasonable time. In this paper, we propose the distributed bOD discovery algorithm DISTOD, whose execution time scales with the available hardware. DISTOD is a scalable, robust, and elastic bOD discovery approach that combines efficient pruning techniques for bOD candidates in set-based canonical form with a novel, reactive, and distributed search strategy. Our evaluation on various datasets shows that DISTOD outperforms both single-threaded and distributed state-of-the-art bOD discovery algorithms by up to orders of magnitude; it can, in particular, process much larger datasets.}, language = {en} } @phdthesis{Dreseler2022, author = {Dreseler, Markus}, title = {Automatic tiering for in-memory database systems}, doi = {10.25932/publishup-55825}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-558253}, school = {Universit{\"a}t Potsdam}, pages = {vii, 143}, year = {2022}, abstract = {A decade ago, it became feasible to store multi-terabyte databases in main memory. These in-memory databases (IMDBs) profit from DRAM's low latency and high throughput as well as from the removal of costly abstractions used in disk-based systems, such as the buffer cache. However, as the DRAM technology approaches physical limits, scaling these databases becomes difficult. Non-volatile memory (NVM) addresses this challenge. This new type of memory is persistent, has more capacity than DRAM (4x), and does not suffer from its density-inhibiting limitations. Yet, as NVM has a higher latency (5-15x) and a lower throughput (0.35x), it cannot fully replace DRAM. IMDBs thus need to navigate the trade-off between the two memory tiers. We present a solution to this optimization problem. Leveraging information about access frequencies and patterns, our solution utilizes NVM's additional capacity while minimizing the associated access costs. Unlike buffer cache-based implementations, our tiering abstraction does not add any costs when reading data from DRAM. As such, it can act as a drop-in replacement for existing IMDBs. Our contributions are as follows: (1) As the foundation for our research, we present Hyrise, an open-source, columnar IMDB that we re-engineered and re-wrote from scratch. Hyrise enables realistic end-to-end benchmarks of SQL workloads and offers query performance which is competitive with other research and commercial systems. At the same time, Hyrise is easy to understand and modify as repeatedly demonstrated by its uses in research and teaching. (2) We present a novel memory management framework for different memory and storage tiers. By encapsulating the allocation and access methods of these tiers, we enable existing data structures to be stored on different tiers with no modifications to their implementation. Besides DRAM and NVM, we also support and evaluate SSDs and have made provisions for upcoming technologies such as disaggregated memory. (3) To identify the parts of the data that can be moved to (s)lower tiers with little performance impact, we present a tracking method that identifies access skew both in the row and column dimensions and that detects patterns within consecutive accesses. Unlike existing methods that have substantial associated costs, our access counters exhibit no identifiable overhead in standard benchmarks despite their increased accuracy. (4) Finally, we introduce a tiering algorithm that optimizes the data placement for a given memory budget. In the TPC-H benchmark, this allows us to move 90\% of the data to NVM while the throughput is reduced by only 10.8\% and the query latency is increased by 11.6\%. With this, we outperform approaches that ignore the workload's access skew and access patterns and increase the query latency by 20\% or more. Individually, our contributions provide novel approaches to current challenges in systems engineering and database research. Combining them allows IMDBs to scale past the limits of DRAM while continuing to profit from the benefits of in-memory computing.}, language = {en} } @phdthesis{Boeken2022, author = {B{\"o}ken, Bj{\"o}rn}, title = {Improving prediction accuracy using dynamic information}, doi = {10.25932/publishup-58512}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-585125}, school = {Universit{\"a}t Potsdam}, pages = {xii, 160}, year = {2022}, abstract = {Accurately solving classification problems nowadays is likely to be the most relevant machine learning task. Binary classification separating two classes only is algorithmically simpler but has fewer potential applications as many real-world problems are multi-class. On the reverse, separating only a subset of classes simplifies the classification task. Even though existing multi-class machine learning algorithms are very flexible regarding the number of classes, they assume that the target set Y is fixed and cannot be restricted once the training is finished. On the other hand, existing state-of-the-art production environments are becoming increasingly interconnected with the advance of Industry 4.0 and related technologies such that additional information can simplify the respective classification problems. In light of this, the main aim of this thesis is to introduce dynamic classification that generalizes multi-class classification such that the target class set can be restricted arbitrarily to a non-empty class subset M of Y at any time between two consecutive predictions. This task is solved by a combination of two algorithmic approaches. First, classifier calibration, which transforms predictions into posterior probability estimates that are intended to be well calibrated. The analysis provided focuses on monotonic calibration and in particular corrects wrong statements that appeared in the literature. It also reveals that bin-based evaluation metrics, which became popular in recent years, are unjustified and should not be used at all. Next, the validity of Platt scaling, which is the most relevant parametric calibration approach, is analyzed in depth. In particular, its optimality for classifier predictions distributed according to four different families of probability distributions as well its equivalence with Beta calibration up to a sigmoidal preprocessing are proven. For non-monotonic calibration, extended variants on kernel density estimation and the ensemble method EKDE are introduced. Finally, the calibration techniques are evaluated using a simulation study with complete information as well as on a selection of 46 real-world data sets. Building on this, classifier calibration is applied as part of decomposition-based classification that aims to reduce multi-class problems to simpler (usually binary) prediction tasks. For the involved fusing step performed at prediction time, a new approach based on evidence theory is presented that uses classifier calibration to model mass functions. This allows the analysis of decomposition-based classification against a strictly formal background and to prove closed-form equations for the overall combinations. Furthermore, the same formalism leads to a consistent integration of dynamic class information, yielding a theoretically justified and computationally tractable dynamic classification model. The insights gained from this modeling are combined with pairwise coupling, which is one of the most relevant reduction-based classification approaches, such that all individual predictions are combined with a weight. This not only generalizes existing works on pairwise coupling but also enables the integration of dynamic class information. Lastly, a thorough empirical study is performed that compares all newly introduced approaches to existing state-of-the-art techniques. For this, evaluation metrics for dynamic classification are introduced that depend on corresponding sampling strategies. Thereafter, these are applied during a three-part evaluation. First, support vector machines and random forests are applied on 26 data sets from the UCI Machine Learning Repository. Second, two state-of-the-art deep neural networks are evaluated on five benchmark data sets from a relatively recent reference work. Here, computationally feasible strategies to apply the presented algorithms in combination with large-scale models are particularly relevant because a naive application is computationally intractable. Finally, reference data from a real-world process allowing the inclusion of dynamic class information are collected and evaluated. The results show that in combination with support vector machines and random forests, pairwise coupling approaches yield the best results, while in combination with deep neural networks, differences between the different approaches are mostly small to negligible. Most importantly, all results empirically confirm that dynamic classification succeeds in improving the respective prediction accuracies. Therefore, it is crucial to pass dynamic class information in respective applications, which requires an appropriate digital infrastructure.}, language = {en} } @article{KrauseGrosseDetersBaumannetal.2022, author = {Krause, Hannes-Vincent and Große Deters, Fenne and Baumann, Annika and Krasnova, Hanna}, title = {Active social media use and its impact on well-being}, series = {Journal of computer-mediated communication : a journal of the International Communication Association}, volume = {28}, journal = {Journal of computer-mediated communication : a journal of the International Communication Association}, number = {1}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1083-6101}, doi = {10.1093/jcmc/zmac037}, pages = {12}, year = {2022}, abstract = {Active use of social networking sites (SNSs) has long been assumed to benefit users' well-being. However, this established hypothesis is increasingly being challenged, with scholars criticizing its lack of empirical support and the imprecise conceptualization of active use. Nevertheless, with considerable heterogeneity among existing studies on the hypothesis and causal evidence still limited, a final verdict on its robustness is still pending. To contribute to this ongoing debate, we conducted a week-long randomized control trial with N = 381 adult Instagram users recruited via Prolific. Specifically, we tested how active SNS use, operationalized as picture postings on Instagram, affects different dimensions of well-being. The results depicted a positive effect on users' positive affect but null findings for other well-being outcomes. The findings broadly align with the recent criticism against the active use hypothesis and support the call for a more nuanced view on the impact of SNSs.
Lay Summary Active use of social networking sites (SNSs) has long been assumed to benefit users' well-being. However, this established assumption is increasingly being challenged, with scholars criticizing its lack of empirical support and the imprecise conceptualization of active use. Nevertheless, with great diversity among conducted studies on the hypothesis and a lack of causal evidence, a final verdict on its viability is still pending. To contribute to this ongoing debate, we conducted a week-long experimental investigation with 381 adult Instagram users. Specifically, we tested how posting pictures on Instagram affects different aspects of well-being. The results of this study depicted a positive effect of posting Instagram pictures on users' experienced positive emotions but no effects on other aspects of well-being. The findings broadly align with the recent criticism against the active use hypothesis and support the call for a more nuanced view on the impact of SNSs on users.}, language = {en} } @phdthesis{Plauth2022, author = {Plauth, Max Frederik}, title = {Improving the Accessibility of Heterogeneous System Resources for Application Developers using Programming Abstractions}, doi = {10.25932/publishup-55811}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-558118}, school = {Universit{\"a}t Potsdam}, pages = {ix, 133}, year = {2022}, abstract = {The heterogeneity of today's state-of-the-art computer architectures is confronting application developers with an immense degree of complexity which results from two major challenges. First, developers need to acquire profound knowledge about the programming models or the interaction models associated with each type of heterogeneous system resource to make efficient use thereof. Second, developers must take into account that heterogeneous system resources always need to exchange data with each other in order to work on a problem together. However, this data exchange is always associated with a certain amount of overhead, which is why the amounts of data exchanged should be kept as low as possible. This thesis proposes three programming abstractions to lessen the burdens imposed by these major challenges with the goal of making heterogeneous system resources accessible to a wider range of application developers. The lib842 compression library provides the first method for accessing the compression and decompression facilities of the NX-842 on-chip compression accelerator available in IBM Power CPUs from user space applications running on Linux. Addressing application development of scale-out GPU workloads, the CloudCL framework makes the resources of GPU clusters more accessible by hiding many aspects of distributed computing while enabling application developers to focus on the aspects of the data parallel programming model associated with GPUs. Furthermore, CloudCL is augmented with transparent data compression facilities based on the lib842 library in order to improve the efficiency of data transfers among cluster nodes. The improved data transfer efficiency provided by the integration of transparent data compression yields performance improvements ranging between 1.11x and 2.07x across four data-intensive scale-out GPU workloads. To investigate the impact of programming abstractions for data placement in NUMA systems, a comprehensive evaluation of the PGASUS framework for NUMA-aware C++ application development is conducted. On a wide range of test systems, the evaluation demonstrates that PGASUS does not only improve the developer experience across all workloads, but that it is also capable of outperforming NUMA-agnostic implementations with average performance improvements of 1.56x. Based on these programming abstractions, this thesis demonstrates that by providing a sufficient degree of abstraction, the accessibility of heterogeneous system resources can be improved for application developers without occluding performance-critical properties of the underlying hardware.}, language = {en} } @misc{UllrichVladovaEigelshovenetal.2022, author = {Ullrich, Andr{\´e} and Vladova, Gergana and Eigelshoven, Felix and Renz, Andr{\´e}}, title = {Data mining of scientific research on artificial intelligence in teaching and administration in higher education institutions}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Wirtschafts- und Sozialwissenschaftliche Reihe}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Wirtschafts- und Sozialwissenschaftliche Reihe}, number = {160}, issn = {1867-5808}, doi = {10.25932/publishup-58907}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-589077}, pages = {18}, year = {2022}, abstract = {Teaching and learning as well as administrative processes are still experiencing intensive changes with the rise of artificial intelligence (AI) technologies and its diverse application opportunities in the context of higher education. Therewith, the scientific interest in the topic in general, but also specific focal points rose as well. However, there is no structured overview on AI in teaching and administration processes in higher education institutions that allows to identify major research topics and trends, and concretizing peculiarities and develops recommendations for further action. To overcome this gap, this study seeks to systematize the current scientific discourse on AI in teaching and administration in higher education institutions. This study identified an (1) imbalance in research on AI in educational and administrative contexts, (2) an imbalance in disciplines and lack of interdisciplinary research, (3) inequalities in cross-national research activities, as well as (4) neglected research topics and paths. In this way, a comparative analysis between AI usage in administration and teaching and learning processes, a systematization of the state of research, an identification of research gaps as well as further research path on AI in higher education institutions are contributed to research.}, language = {en} } @phdthesis{Draisbach2022, author = {Draisbach, Uwe}, title = {Efficient duplicate detection and the impact of transitivity}, doi = {10.25932/publishup-57214}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140}, school = {Universit{\"a}t Potsdam}, pages = {x, 150}, year = {2022}, abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but has led to a renaissance in duplicate detection research on the other hand. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. Transitivity implies that if record pairs ⟨ri,rj⟩ and ⟨rj,rk⟩ are classified as duplicates, then also record pair ⟨ri,rk⟩ has to be a duplicate. However, this reasoning might contradict with the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the threshold selection to classify whether a record pair is a duplicate or non-duplicate. Our experiments show that independently of the pair selection algorithm and the used similarity measure, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, earlier selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, we present with the Duplicate Count Strategy (DCS) and its enhancement DCS++ two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNMs window size based on the number of detected duplicates and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure is used for pairwise classifications to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially for the precision of results. In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best performing clustering approach for duplicate detection, although its runtime is longer than Markov Clustering due to the subexponential time complexity. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} } @misc{AlLabanRegerLucke2022, author = {Al Laban, Firas and Reger, Martin and Lucke, Ulrike}, title = {Closing the Policy Gap in the Academic Bridge}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Mathematisch-Naturwissenschaftliche Reihe}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Mathematisch-Naturwissenschaftliche Reihe}, number = {1310}, issn = {1866-8372}, doi = {10.25932/publishup-58357}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-583572}, pages = {22}, year = {2022}, abstract = {The highly structured nature of the educational sector demands effective policy mechanisms close to the needs of the field. That is why evidence-based policy making, endorsed by the European Commission under Erasmus+ Key Action 3, aims to make an alignment between the domains of policy and practice. Against this background, this article addresses two issues: First, that there is a vertical gap in the translation of higher-level policies to local strategies and regulations. Second, that there is a horizontal gap between educational domains regarding the policy awareness of individual players. This was analyzed in quantitative and qualitative studies with domain experts from the fields of virtual mobility and teacher training. From our findings, we argue that the combination of both gaps puts the academic bridge from secondary to tertiary education at risk, including the associated knowledge proficiency levels. We discuss the role of digitalization in the academic bridge by asking the question: which value does the involved stakeholders expect from educational policies? As a theoretical basis, we rely on the model of value co-creation for and by stakeholders. We describe the used instruments along with the obtained results and proposed benefits. Moreover, we reflect on the methodology applied, and we finally derive recommendations for future academic bridge policies.}, language = {en} } @phdthesis{Melnichenko2022, author = {Melnichenko, Anna}, title = {Selfish Creation of Realistic Networks}, doi = {10.25932/publishup-54814}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-548141}, school = {Universit{\"a}t Potsdam}, pages = {xi, 175}, year = {2022}, abstract = {Complex networks like the Internet or social networks are fundamental parts of our everyday lives. It is essential to understand their structural properties and how these networks are formed. A game-theoretic approach to network design problems has become of high interest in the last decades. The reason is that many real-world networks are the outcomes of decentralized strategic behavior of independent agents without central coordination. Fabrikant, Luthra, Maneva, Papadimitriou, and Schenker proposed a game-theoretic model aiming to explain the formation of the Internet-like networks. In this model, called the Network Creation Game, agents are associated with nodes of a network. Each agent seeks to maximize her centrality by establishing costly connections to other agents. The model is relatively simple but shows a high potential in modeling complex real-world networks. In this thesis, we contribute to the line of research on variants of the Network Creation Games. Inspired by real-world networks, we propose and analyze several novel network creation models. We aim to understand the impact of certain realistic modeling assumptions on the structure of the created networks and the involved agents' behavior. The first natural additional objective that we consider is the network's robustness. We consider a game where the agents seek to maximize their centrality and, at the same time, the stability of the created network against random edge failure. Our second point of interest is a model that incorporates an underlying geometry. We consider a network creation model where the agents correspond to points in some underlying space and where edge lengths are equal to the distances between the endpoints in that space. The geometric setting captures many physical real-world networks like transport networks and fiber-optic communication networks. We focus on the formation of social networks and consider two models that incorporate particular realistic behavior observed in real-world networks. In the first model, we embed the anti-preferential attachment link formation. Namely, we assume that the cost of the connection is proportional to the popularity of the targeted agent. Our second model is based on the observation that the probability of two persons to connect is inversely proportional to the length of their shortest chain of mutual acquaintances. For each of the four models above, we provide a complete game-theoretical analysis. In particular, we focus on distinctive structural properties of the equilibria, the hardness of computing a best response, the quality of equilibria in comparison to the centrally designed socially optimal networks. We also analyze the game dynamics, i.e., the process of sequential strategic improvements by the agents, and analyze the convergence to an equilibrium state and its properties.}, language = {en} } @phdthesis{Haarmann2022, author = {Haarmann, Stephan}, title = {WICKR: A Joint Semantics for Flexible Processes and Data}, doi = {10.25932/publishup-54613}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-546137}, school = {Universit{\"a}t Potsdam}, pages = {xvii, 191}, year = {2022}, abstract = {Knowledge-intensive business processes are flexible and data-driven. Therefore, traditional process modeling languages do not meet their requirements: These languages focus on highly structured processes in which data plays a minor role. As a result, process-oriented information systems fail to assist knowledge workers on executing their processes. We propose a novel case management approach that combines flexible activity-centric processes with data models, and we provide a joint semantics using colored Petri nets. The approach is suited to model, verify, and enact knowledge-intensive processes and can aid the development of information systems that support knowledge work. Knowledge-intensive processes are human-centered, multi-variant, and data-driven. Typical domains include healthcare, insurances, and law. The processes cannot be fully modeled, since the underlying knowledge is too vast and changes too quickly. Thus, models for knowledge-intensive processes are necessarily underspecified. In fact, a case emerges gradually as knowledge workers make informed decisions. Knowledge work imposes special requirements on modeling and managing respective processes. They include flexibility during design and execution, ad-hoc adaption to unforeseen situations, and the integration of behavior and data. However, the predominantly used process modeling languages (e.g., BPMN) are unsuited for this task. Therefore, novel modeling languages have been proposed. Many of them focus on activities' data requirements and declarative constraints rather than imperative control flow. Fragment-Based Case Management, for example, combines activity-centric imperative process fragments with declarative data requirements. At runtime, fragments can be combined dynamically, and new ones can be added. Yet, no integrated semantics for flexible activity-centric process models and data models exists. In this thesis, Wickr, a novel case modeling approach extending fragment-based Case Management, is presented. It supports batch processing of data, sharing data among cases, and a full-fledged data model with associations and multiplicity constraints. We develop a translational semantics for Wickr targeting (colored) Petri nets. The semantics assert that a case adheres to the constraints in both the process fragments and the data models. Among other things, multiplicity constraints must not be violated. Furthermore, the semantics are extended to multiple cases that operate on shared data. Wickr shows that the data structure may reflect process behavior and vice versa. Based on its semantics, prototypes for executing and verifying case models showcase the feasibility of Wickr. Its applicability to knowledge-intensive and to data-centric processes is evaluated using well-known requirements from related work.}, language = {en} } @article{UllrichVladovaEigelshovenetal.2022, author = {Ullrich, Andr{\´e} and Vladova, Gergana and Eigelshoven, Felix and Renz, Andr{\´e}}, title = {Data mining of scientific research on artificial intelligence in teaching and administration in higher education institutions}, series = {Discover artificial intelligence}, volume = {2}, journal = {Discover artificial intelligence}, publisher = {Springer}, address = {Cham}, issn = {2731-0809}, doi = {10.1007/s44163-022-00031-7}, pages = {18}, year = {2022}, abstract = {Teaching and learning as well as administrative processes are still experiencing intensive changes with the rise of artificial intelligence (AI) technologies and its diverse application opportunities in the context of higher education. Therewith, the scientific interest in the topic in general, but also specific focal points rose as well. However, there is no structured overview on AI in teaching and administration processes in higher education institutions that allows to identify major research topics and trends, and concretizing peculiarities and develops recommendations for further action. To overcome this gap, this study seeks to systematize the current scientific discourse on AI in teaching and administration in higher education institutions. This study identified an (1) imbalance in research on AI in educational and administrative contexts, (2) an imbalance in disciplines and lack of interdisciplinary research, (3) inequalities in cross-national research activities, as well as (4) neglected research topics and paths. In this way, a comparative analysis between AI usage in administration and teaching and learning processes, a systematization of the state of research, an identification of research gaps as well as further research path on AI in higher education institutions are contributed to research.}, language = {en} } @article{BenderKoerppen2022, author = {Bender, Benedict and K{\"o}rppen, Tim}, title = {Integriert statt isoliert}, series = {Digital business : cloud}, volume = {26}, journal = {Digital business : cloud}, number = {1}, publisher = {WIN-Verlag GmbH \& Co. KG}, address = {Vaterstetten}, issn = {2510-344X}, pages = {26 -- 27}, year = {2022}, abstract = {Dass Daten und Analysen Innovationstreiber sind und nicht mehr nur einen Hygienefaktor darstellen, haben viele Unternehmen erkannt. Um Potenziale zu heben, m{\"u}ssen Daten zielf{\"u}hrend integriert werden. Komplexe Systemlandschaften und isolierte Datenbest{\"a}nde erschweren dies. Technologien f{\"u}r die erfolgreiche Umsetzung von datengetriebenem Management m{\"u}ssen richtig eingesetzt werden.}, language = {de} } @article{SteinertStabernack2022, author = {Steinert, Fritjof and Stabernack, Benno}, title = {Architecture of a low latency H.264/AVC video codec for robust ML based image classification how region of interests can minimize the impact of coding artifacts}, series = {Journal of Signal Processing Systems for Signal, Image, and Video Technology}, volume = {94}, journal = {Journal of Signal Processing Systems for Signal, Image, and Video Technology}, number = {7}, publisher = {Springer}, address = {New York}, issn = {1939-8018}, doi = {10.1007/s11265-021-01727-2}, pages = {693 -- 708}, year = {2022}, abstract = {The use of neural networks is considered as the state of the art in the field of image classification. A large number of different networks are available for this purpose, which, appropriately trained, permit a high level of classification accuracy. Typically, these networks are applied to uncompressed image data, since a corresponding training was also carried out using image data of similar high quality. However, if image data contains image errors, the classification accuracy deteriorates drastically. This applies in particular to coding artifacts which occur due to image and video compression. Typical application scenarios for video compression are narrowband transmission channels for which video coding is required but a subsequent classification is to be carried out on the receiver side. In this paper we present a special H.264/Advanced Video Codec (AVC) based video codec that allows certain regions of a picture to be coded with near constant picture quality in order to allow a reliable classification using neural networks, whereas the remaining image will be coded using constant bit rate. We have combined this feature with the ability to run with lowest latency properties, which is usually also required in remote control applications scenarios. The codec has been implemented as a fully hardwired High Definition video capable hardware architecture which is suitable for Field Programmable Gate Arrays.}, language = {en} } @article{BonifatiMiorNaumannetal.2022, author = {Bonifati, Angela and Mior, Michael J. and Naumann, Felix and Noack, Nele Sina}, title = {How inclusive are we?}, series = {SIGMOD record / Association for Computing Machinery, Special Interest Group on Management of Data}, volume = {50}, journal = {SIGMOD record / Association for Computing Machinery, Special Interest Group on Management of Data}, number = {4}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {0163-5808}, doi = {10.1145/3516431.3516438}, pages = {30 -- 35}, year = {2022}, abstract = {ACM SIGMOD, VLDB and other database organizations have committed to fostering an inclusive and diverse community, as do many other scientific organizations. Recently, different measures have been taken to advance these goals, especially for underrepresented groups. One possible measure is double-blind reviewing, which aims to hide gender, ethnicity, and other properties of the authors.
We report the preliminary results of a gender diversity analysis of publications of the database community across several peer-reviewed venues, and also compare women's authorship percentages in both single-blind and double-blind venues along the years. We also obtained a cross comparison of the obtained results in data management with other relevant areas in Computer Science.}, language = {en} } @article{AlnoorTiberiusAtiyahetal.2022, author = {Alnoor, Alhamzah and Tiberius, Victor and Atiyah, Abbas Gatea and Khaw, Khai Wah and Yin, Teh Sin and Chew, XinYing and Abbas, Sammar}, title = {How positive and negative electronic word of mouth (eWOM) affects customers' intention to use social commerce?}, series = {International journal of human computer interaction}, journal = {International journal of human computer interaction}, publisher = {Taylor \& Francis}, address = {New York}, issn = {1044-7318}, doi = {10.1080/10447318.2022.2125610}, pages = {1 -- 30}, year = {2022}, abstract = {Advances in Web 2.0 technologies have led to the widespread assimilation of electronic commerce platforms as an innovative shopping method and an alternative to traditional shopping. However, due to pro-technology bias, scholars focus more on adopting technology, and slightly less attention has been given to the impact of electronic word of mouth (eWOM) on customers' intention to use social commerce. This study addresses the gap by examining the intention through exploring the effect of eWOM on males' and females' intentions and identifying the mediation of perceived crowding. To this end, we adopted a dual-stage multi-group structural equation modeling and artificial neural network (SEM-ANN) approach. We successfully extended the eWOM concept by integrating negative and positive factors and perceived crowding. The results reveal the causal and non-compensatory relationships between the constructs. The variables supported by the SEM analysis are adopted as the ANN model's input neurons. According to the natural significance obtained from the ANN approach, males' intentions to accept social commerce are related mainly to helping the company, followed by core functionalities. In contrast, females are highly influenced by technical aspects and mishandling. The ANN model predicts customers' intentions to use social commerce with an accuracy of 97\%. We discuss the theoretical and practical implications of increasing customers' intention toward social commerce channels among consumers based on our findings.}, language = {en} } @book{EichenrothReinHirschfeld2022, author = {Eichenroth, Friedrich and Rein, Patrick and Hirschfeld, Robert}, title = {Fast packrat parsing in a live programming environment}, series = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam}, journal = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam}, number = {135}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-503-3}, issn = {1613-5652}, doi = {10.25932/publishup-49124}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-491242}, publisher = {Universit{\"a}t Potsdam}, pages = {79}, year = {2022}, abstract = {Language developers who design domain-specific languages or new language features need a way to make fast changes to language definitions. Those fast changes require immediate feedback. Also, it should be possible to parse the developed languages quickly to handle extensive sets of code. Parsing expression grammars provides an easy to understand method for language definitions. Packrat parsing is a method to parse grammars of this kind, but this method is unable to handle left-recursion properly. Existing solutions either partially rewrite left-recursive rules and partly forbid them, or use complex extensions to packrat parsing that are hard to understand and cost-intensive. We investigated methods to make parsing as fast as possible, using easy to follow algorithms while not losing the ability to make fast changes to grammars. We focused our efforts on two approaches. One is to start from an existing technique for limited left-recursion rewriting and enhance it to work for general left-recursive grammars. The second approach is to design a grammar compilation process to find left-recursion before parsing, and in this way, reduce computational costs wherever possible and generate ready to use parser classes. Rewriting parsing expression grammars is a task that, if done in a general way, unveils a large number of cases such that any rewriting algorithm surpasses the complexity of other left-recursive parsing algorithms. Lookahead operators introduce this complexity. However, most languages have only little portions that are left-recursive and in virtually all cases, have no indirect or hidden left-recursion. This means that the distinction of left-recursive parts of grammars from components that are non-left-recursive holds great improvement potential for existing parsers. In this report, we list all the required steps for grammar rewriting to handle left-recursion, including grammar analysis, grammar rewriting itself, and syntax tree restructuring. Also, we describe the implementation of a parsing expression grammar framework in Squeak/Smalltalk and the possible interactions with the already existing parser Ohm/S. We quantitatively benchmarked this framework directing our focus on parsing time and the ability to use it in a live programming context. Compared with Ohm, we achieved massive parsing time improvements while preserving the ability to use our parser it as a live programming tool. The work is essential because, for one, we outlined the difficulties and complexity that come with grammar rewriting. Also, we removed the existing limitations that came with left-recursion by eliminating them before parsing.}, language = {en} } @book{FreundRaetschHradilaketal.2022, author = {Freund, Rieke and R{\"a}tsch, Jan Philip and Hradilak, Franziska and Vidic, Benedikt and Heß, Oliver and Lißner, Nils and W{\"o}lert, Hendrik and Lincke, Jens and Beckmann, Tom and Hirschfeld, Robert}, title = {Implementing a crowd-sourced picture archive for Bad Harzburg}, number = {149}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-545-3}, issn = {1613-5652}, doi = {10.25932/publishup-56029}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-560291}, publisher = {Universit{\"a}t Potsdam}, pages = {x, 191}, year = {2022}, abstract = {Pictures are a medium that helps make the past tangible and preserve memories. Without context, they are not able to do so. Pictures are brought to life by their associated stories. However, the older pictures become, the fewer contemporary witnesses can tell these stories. Especially for large, analog picture archives, knowledge and memories are spread over many people. This creates several challenges: First, the pictures must be digitized to save them from decaying and make them available to the public. Since a simple listing of all the pictures is confusing, the pictures should be structured accessibly. Second, known information that makes the stories vivid needs to be added to the pictures. Users should get the opportunity to contribute their knowledge and memories. To make this usable for all interested parties, even for older, less technophile generations, the interface should be intuitive and error-tolerant. The resulting requirements are not covered in their entirety by any existing software solution without losing the intuitive interface or the scalability of the system. Therefore, we have developed our digital picture archive within the scope of a bachelor project in cooperation with the Bad Harzburg-Stiftung. For the implementation of this web application, we use the UI framework React in the frontend, which communicates via a GraphQL interface with the Content Management System Strapi in the backend. The use of this system enables our project partner to create an efficient process from scanning analog pictures to presenting them to visitors in an organized and annotated way. To customize the solution for both picture delivery and information contribution for our target group, we designed prototypes and evaluated them with people from Bad Harzburg. This helped us gain valuable insights into our system's usability and future challenges as well as requirements. Our web application is already being used daily by our project partner. During the project, we still came up with numerous ideas for additional features to further support the exchange of knowledge.}, language = {en} } @book{SchneiderMaximovaGiese2022, author = {Schneider, Sven and Maximova, Maria and Giese, Holger}, title = {Invariant Analysis for Multi-Agent Graph Transformation Systems using k-Induction}, number = {143}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-531-6}, issn = {1613-5652}, doi = {10.25932/publishup-54585}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-545851}, publisher = {Universit{\"a}t Potsdam}, pages = {37}, year = {2022}, abstract = {The analysis of behavioral models such as Graph Transformation Systems (GTSs) is of central importance in model-driven engineering. However, GTSs often result in intractably large or even infinite state spaces and may be equipped with multiple or even infinitely many start graphs. To mitigate these problems, static analysis techniques based on finite symbolic representations of sets of states or paths thereof have been devised. We focus on the technique of k-induction for establishing invariants specified using graph conditions. To this end, k-induction generates symbolic paths backwards from a symbolic state representing a violation of a candidate invariant to gather information on how that violation could have been reached possibly obtaining contradictions to assumed invariants. However, GTSs where multiple agents regularly perform actions independently from each other cannot be analyzed using this technique as of now as the independence among backward steps may prevent the gathering of relevant knowledge altogether. In this paper, we extend k-induction to GTSs with multiple agents thereby supporting a wide range of additional GTSs. As a running example, we consider an unbounded number of shuttles driving on a large-scale track topology, which adjust their velocity to speed limits to avoid derailing. As central contribution, we develop pruning techniques based on causality and independence among backward steps and verify that k-induction remains sound under this adaptation as well as terminates in cases where it did not terminate before.}, language = {en} } @book{SchneiderMaximovaGiese2022, author = {Schneider, Sven and Maximova, Maria and Giese, Holger}, title = {Probabilistic metric temporal graph logic}, number = {146}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-532-3}, issn = {1613-5652}, doi = {10.25932/publishup-54586}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-545867}, publisher = {Universit{\"a}t Potsdam}, pages = {34}, year = {2022}, abstract = {Cyber-physical systems often encompass complex concurrent behavior with timing constraints and probabilistic failures on demand. The analysis whether such systems with probabilistic timed behavior adhere to a given specification is essential. When the states of the system can be represented by graphs, the rule-based formalism of Probabilistic Timed Graph Transformation Systems (PTGTSs) can be used to suitably capture structure dynamics as well as probabilistic and timed behavior of the system. The model checking support for PTGTSs w.r.t. properties specified using Probabilistic Timed Computation Tree Logic (PTCTL) has been already presented. Moreover, for timed graph-based runtime monitoring, Metric Temporal Graph Logic (MTGL) has been developed for stating metric temporal properties on identified subgraphs and their structural changes over time. In this paper, we (a) extend MTGL to the Probabilistic Metric Temporal Graph Logic (PMTGL) by allowing for the specification of probabilistic properties, (b) adapt our MTGL satisfaction checking approach to PTGTSs, and (c) combine the approaches for PTCTL model checking and MTGL satisfaction checking to obtain a Bounded Model Checking (BMC) approach for PMTGL. In our evaluation, we apply an implementation of our BMC approach in AutoGraph to a running example.}, language = {en} } @book{KlinkeVerhoevenRothetal.2022, author = {Klinke, Paula and Verhoeven, Silvan and Roth, Felix and Hagemann, Linus and Alnawa, Tarik and Lincke, Jens and Rein, Patrick and Hirschfeld, Robert}, title = {Tool support for collaborative creation of interactive storytelling media}, number = {141}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-521-7}, issn = {1613-5652}, doi = {10.25932/publishup-51857}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-518570}, publisher = {Universit{\"a}t Potsdam}, pages = {x, 167}, year = {2022}, abstract = {Scrollytellings are an innovative form of web content. Combining the benefits of books, images, movies, and video games, they are a tool to tell compelling stories and provide excellent learning opportunities. Due to their multi-modality, creating high-quality scrollytellings is not an easy task. Different professions, such as content designers, graphics designers, and developers, need to collaborate to get the best out of the possibilities the scrollytelling format provides. Collaboration unlocks great potential. However, content designers cannot create scrollytellings directly and always need to consult with developers to implement their vision. This can result in misunderstandings. Often, the resulting scrollytelling will not match the designer's vision sufficiently, causing unnecessary iterations. Our project partner Typeshift specializes in the creation of individualized scrollytellings for their clients. Examined existing solutions for authoring interactive content are not optimally suited for creating highly customized scrollytellings while still being able to manipulate all their elements programmatically. Based on their experience and expertise, we developed an editor to author scrollytellings in the lively.next live-programming environment. In this environment, a graphical user interface for content design is combined with powerful possibilities for programming behavior with the morphic system. The editor allows content designers to take on large parts of the creation process of scrollytellings on their own, such as creating the visible elements, animating content, and fine-tuning the scrollytelling. Hence, developers can focus on interactive elements such as simulations and games. Together with Typeshift, we evaluated the tool by recreating an existing scrollytelling and identified possible future enhancements. Our editor streamlines the creation process of scrollytellings. Content designers and developers can now both work on the same scrollytelling. Due to the editor inside of the lively.next environment, they can both work with a set of tools familiar to them and their traits. Thus, we mitigate unnecessary iterations and misunderstandings by enabling content designers to realize large parts of their vision of a scrollytelling on their own. Developers can add advanced and individual behavior. Thus, developers and content designers benefit from a clearer distribution of tasks while keeping the benefits of collaboration.}, language = {en} } @book{DuerschReinMattisetal.2022, author = {D{\"u}rsch, Falco and Rein, Patrick and Mattis, Toni and Hirschfeld, Robert}, title = {Learning from failure}, number = {145}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-528-6}, issn = {1613-5652}, doi = {10.25932/publishup-53755}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-537554}, publisher = {Universit{\"a}t Potsdam}, pages = {87}, year = {2022}, abstract = {Regression testing is a widespread practice in today's software industry to ensure software product quality. Developers derive a set of test cases, and execute them frequently to ensure that their change did not adversely affect existing functionality. As the software product and its test suite grow, the time to feedback during regression test sessions increases, and impedes programmer productivity: developers wait longer for tests to complete, and delays in fault detection render fault removal increasingly difficult. Test case prioritization addresses the problem of long feedback loops by reordering test cases, such that test cases of high failure probability run first, and test case failures become actionable early in the testing process. We ask, given test execution schedules reconstructed from publicly available data, to which extent can their fault detection efficiency improved, and which technique yields the most efficient test schedules with respect to APFD? To this end, we recover regression 6200 test sessions from the build log files of Travis CI, a popular continuous integration service, and gather 62000 accompanying changelists. We evaluate the efficiency of current test schedules, and examine the prioritization results of state-of-the-art lightweight, history-based heuristics. We propose and evaluate a novel set of prioritization algorithms, which connect software changes and test failures in a matrix-like data structure. Our studies indicate that the optimization potential is substantial, because the existing test plans score only 30\% APFD. The predictive power of past test failures proves to be outstanding: simple heuristics, such as repeating tests with failures in recent sessions, result in efficiency scores of 95\% APFD. The best-performing matrix-based heuristic achieves a similar score of 92.5\% APFD. In contrast to prior approaches, we argue that matrix-based techniques are useful beyond the scope of effective prioritization, and enable a number of use cases involving software maintenance. We validate our findings from continuous integration processes by extending a continuous testing tool within development environments with means of test prioritization, and pose further research questions. We think that our findings are suited to propel adoption of (continuous) testing practices, and that programmers' toolboxes should contain test prioritization as an existential productivity tool.}, language = {en} } @article{BlaesiusFriedrichLischeidetal.2022, author = {Bl{\"a}sius, Thomas and Friedrich, Tobias and Lischeid, Julius and Meeks, Kitty and Schirneck, Friedrich Martin}, title = {Efficiently enumerating hitting sets of hypergraphs arising in data profiling}, series = {Journal of computer and system sciences : JCSS}, volume = {124}, journal = {Journal of computer and system sciences : JCSS}, publisher = {Elsevier}, address = {San Diego}, issn = {0022-0000}, doi = {10.1016/j.jcss.2021.10.002}, pages = {192 -- 213}, year = {2022}, abstract = {The transversal hypergraph problem asks to enumerate the minimal hitting sets of a hypergraph. If the solutions have bounded size, Eiter and Gottlob [SICOMP'95] gave an algorithm running in output-polynomial time, but whose space requirement also scales with the output. We improve this to polynomial delay and space. Central to our approach is the extension problem, deciding for a set X of vertices whether it is contained in any minimal hitting set. We show that this is one of the first natural problems to be W[3]-complete. We give an algorithm for the extension problem running in time O(m(vertical bar X vertical bar+1) n) and prove a SETH-lower bound showing that this is close to optimal. We apply our enumeration method to the discovery problem of minimal unique column combinations from data profiling. Our empirical evaluation suggests that the algorithm outperforms its worst-case guarantees on hypergraphs stemming from real-world databases.}, language = {en} } @article{vonSteinauSteinrueckKurth2022, author = {von Steinau-Steinr{\"u}ck, Robert and Kurth, Paula Sophie}, title = {Das reformierte Statusfeststellungsverfahren in der Praxis}, series = {NJW spezial}, volume = {19}, journal = {NJW spezial}, number = {24}, publisher = {C.H. Beck}, address = {M{\"u}nchen}, issn = {1613-4621}, pages = {754 -- 755}, year = {2022}, abstract = {Das Statusfeststellungsverfahren erm{\"o}glicht auf Antrag bei der alleinzust{\"a}ndigen Deutschen Rentenversicherung Bund den Erhalt einer verbindlichen Einsch{\"a}tzung der h{\"a}ufig komplizierten und folgenschweren Abgrenzung einer selbstst{\"a}ndigen T{\"a}tigkeit von einer abh{\"a}ngigen Besch{\"a}ftigung. Zum 1.4.2022 wurde das Statusfeststellungsverfahren umfassend reformiert. In der Praxis haben sich die eingef{\"u}hrten Novellierungen bislang unterschiedlich bew{\"a}hrt.}, language = {de} } @article{vonSteinauSteinrueckMiller2022, author = {von Steinau-Steinr{\"u}ck, Robert and Miller, Denis}, title = {R{\"u}ckzahlungsklauseln f{\"u}r Fortbildungen}, series = {Neue juristische Wochenschrift : NJW Spezial}, volume = {19}, journal = {Neue juristische Wochenschrift : NJW Spezial}, number = {12}, publisher = {C.H. Beck}, address = {M{\"u}nchen}, issn = {1613-4621}, pages = {370 -- 371}, year = {2022}, abstract = {Mit Urteil vom 1.3.2022 (NZA2022, NZA Jahr 2022 Seite 780) hat das BAG erneut {\"u}ber die Wirksamkeit einer R{\"u}ckzahlungsklausel in einer Fortbildungsvereinbarung entschieden. Die Entscheidung reiht sich in eine nicht leicht zu durchschauende Anzahl von Urteilen hierzu ein. Sie dient uns zum Anlass, einen {\"U}berblick {\"u}ber die Rechtsprechung zu geben.}, language = {de} } @article{vonSteinauSteinrueckHoeltge2022, author = {von Steinau-Steinr{\"u}ck, Robert and H{\"o}ltge, Clara}, title = {Krieg in Europa}, series = {NJW spezial}, volume = {19}, journal = {NJW spezial}, number = {8}, publisher = {C.H. Beck}, address = {M{\"u}nchen}, issn = {1613-4621}, pages = {242 -- 243}, year = {2022}, abstract = {Am 24.2.2022 begann der russische Angriffskrieg in der Ukraine. Seitdem fliehen t{\"a}glich zahlreiche ukrainische Staatsb{\"u}rger in die Europ{\"a}ische Union, viele davon nach Deutschland. Vorrangig ist jetzt die Sicherung der Grundbed{\"u}rfnisse, wie Verpflegung, Unterkunft und medizinischer Versorgung. Daneben fragen sich Arbeitgeber, wie sie ukrainische Staatsb{\"u}rger m{\"o}glichst schnell besch{\"a}ftigen k{\"o}nnen. Wir geben einen {\"U}berblick {\"u}ber die M{\"o}glichkeiten, ukrainische Gefl{\"u}chtete m{\"o}glichst schnell in den deutschen Arbeitsmarkt zu integrieren.}, language = {de} } @phdthesis{Elsaid2022, author = {Elsaid, Mohamed Esameldin Mohamed}, title = {Virtual machines live migration cost modeling and prediction}, doi = {10.25932/publishup-54001}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-540013}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 107}, year = {2022}, abstract = {Dynamic resource management is an essential requirement for private and public cloud computing environments. With dynamic resource management, the physical resources assignment to the cloud virtual resources depends on the actual need of the applications or the running services, which enhances the cloud physical resources utilization and reduces the offered services cost. In addition, the virtual resources can be moved across different physical resources in the cloud environment without an obvious impact on the running applications or services production. This means that the availability of the running services and applications in the cloud is independent on the hardware resources including the servers, switches and storage failures. This increases the reliability of using cloud services compared to the classical data-centers environments. In this thesis we briefly discuss the dynamic resource management topic and then deeply focus on live migration as the definition of the compute resource dynamic management. Live migration is a commonly used and an essential feature in cloud and virtual data-centers environments. Cloud computing load balance, power saving and fault tolerance features are all dependent on live migration to optimize the virtual and physical resources usage. As we will discuss in this thesis, live migration shows many benefits to cloud and virtual data-centers environments, however the cost of live migration can not be ignored. Live migration cost includes the migration time, downtime, network overhead, power consumption increases and CPU overhead. IT admins run virtual machines live migrations without an idea about the migration cost. So, resources bottlenecks, higher migration cost and migration failures might happen. The first problem that we discuss in this thesis is how to model the cost of the virtual machines live migration. Secondly, we investigate how to make use of machine learning techniques to help the cloud admins getting an estimation of this cost before initiating the migration for one of multiple virtual machines. Also, we discuss the optimal timing for a specific virtual machine before live migration to another server. Finally, we propose practical solutions that can be used by the cloud admins to be integrated with the cloud administration portals to answer the raised research questions above. Our research methodology to achieve the project objectives is to propose empirical models based on using VMware test-beds with different benchmarks tools. Then we make use of the machine learning techniques to propose a prediction approach for virtual machines live migration cost. Timing optimization for live migration is also proposed in this thesis based on using the cost prediction and data-centers network utilization prediction. Live migration with persistent memory clusters is also discussed at the end of the thesis. The cost prediction and timing optimization techniques proposed in this thesis could be practically integrated with VMware vSphere cluster portal such that the IT admins can now use the cost prediction feature and timing optimization option before proceeding with a virtual machine live migration. Testing results show that our proposed approach for VMs live migration cost prediction shows acceptable results with less than 20\% prediction error and can be easily implemented and integrated with VMware vSphere as an example of a commonly used resource management portal for virtual data-centers and private cloud environments. The results show that using our proposed VMs migration timing optimization technique also could save up to 51\% of migration time of the VMs migration time for memory intensive workloads and up to 27\% of the migration time for network intensive workloads. This timing optimization technique can be useful for network admins to save migration time with utilizing higher network rate and higher probability of success. At the end of this thesis, we discuss the persistent memory technology as a new trend in servers memory technology. Persistent memory modes of operation and configurations are discussed in detail to explain how live migration works between servers with different memory configuration set up. Then, we build a VMware cluster with persistent memory inside server and also with DRAM only servers to show the live migration cost difference between the VMs with DRAM only versus the VMs with persistent memory inside.}, language = {en} } @book{MeinelJohnWollowski2022, author = {Meinel, Christoph and John, Catrina and Wollowski, Tobias}, title = {Die HPI Schul-Cloud - Von der Vision zur digitale Infrastruktur f{\"u}r deutsche Schulen}, number = {144}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-526-2}, issn = {1613-5652}, doi = {10.25932/publishup-53586}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-535860}, publisher = {Universit{\"a}t Potsdam}, pages = {v, 77}, year = {2022}, abstract = {Digitale Medien sind aus unserem Alltag kaum noch wegzudenken. Einer der zentralsten Bereiche f{\"u}r unsere Gesellschaft, die schulische Bildung, darf hier nicht hintanstehen. Wann immer der Einsatz digital unterst{\"u}tzter Tools p{\"a}dagogisch sinnvoll ist, muss dieser in einem sicheren Rahmen erm{\"o}glicht werden k{\"o}nnen. Die HPI Schul-Cloud ist dieser Vision gefolgt, die vom Nationalen IT-Gipfel 2016 angestoßen wurde und dem Bericht vorangestellt ist - gefolgt. Sie hat sich in den vergangenen f{\"u}nf Jahren vom Pilotprojekt zur unverzichtbaren IT-Infrastruktur f{\"u}r zahlreiche Schulen entwickelt. W{\"a}hrend der Corona-Pandemie hat sie f{\"u}r viele Tausend Schulen wichtige Unterst{\"u}tzung bei der Umsetzung ihres Bildungsauftrags geboten. Das Ziel, eine zukunftssichere und datenschutzkonforme Infrastruktur zur digitalen Unterst{\"u}tzung des Unterrichts zur Verf{\"u}gung zu stellen, hat sie damit mehr als erreicht. Aktuell greifen rund 1,4 Millionen Lehrkr{\"a}fte und Sch{\"u}lerinnen und Sch{\"u}ler bundesweit und an den deutschen Auslandsschulen auf die HPI Schul-Cloud zu.}, language = {de} } @phdthesis{Bartz2022, author = {Bartz, Christian}, title = {Reducing the annotation burden: deep learning for optical character recognition using less manual annotations}, doi = {10.25932/publishup-55540}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-555407}, school = {Universit{\"a}t Potsdam}, pages = {xxiv, 183}, year = {2022}, abstract = {Text is a ubiquitous entity in our world and daily life. We encounter it nearly everywhere in shops, on the street, or in our flats. Nowadays, more and more text is contained in digital images. These images are either taken using cameras, e.g., smartphone cameras, or taken using scanning devices such as document scanners. The sheer amount of available data, e.g., millions of images taken by Google Streetview, prohibits manual analysis and metadata extraction. Although much progress was made in the area of optical character recognition (OCR) for printed text in documents, broad areas of OCR are still not fully explored and hold many research challenges. With the mainstream usage of machine learning and especially deep learning, one of the most pressing problems is the availability and acquisition of annotated ground truth for the training of machine learning models because obtaining annotated training data using manual annotation mechanisms is time-consuming and costly. In this thesis, we address of how we can reduce the costs of acquiring ground truth annotations for the application of state-of-the-art machine learning methods to optical character recognition pipelines. To this end, we investigate how we can reduce the annotation cost by using only a fraction of the typically required ground truth annotations, e.g., for scene text recognition systems. We also investigate how we can use synthetic data to reduce the need of manual annotation work, e.g., in the area of document analysis for archival material. In the area of scene text recognition, we have developed a novel end-to-end scene text recognition system that can be trained using inexact supervision and shows competitive/state-of-the-art performance on standard benchmark datasets for scene text recognition. Our method consists of two independent neural networks, combined using spatial transformer networks. Both networks learn together to perform text localization and text recognition at the same time while only using annotations for the recognition task. We apply our model to end-to-end scene text recognition (meaning localization and recognition of words) and pure scene text recognition without any changes in the network architecture. In the second part of this thesis, we introduce novel approaches for using and generating synthetic data to analyze handwriting in archival data. First, we propose a novel preprocessing method to determine whether a given document page contains any handwriting. We propose a novel data synthesis strategy to train a classification model and show that our data synthesis strategy is viable by evaluating the trained model on real images from an archive. Second, we introduce the new analysis task of handwriting classification. Handwriting classification entails classifying a given handwritten word image into classes such as date, word, or number. Such an analysis step allows us to select the best fitting recognition model for subsequent text recognition; it also allows us to reason about the semantic content of a given document page without the need for fine-grained text recognition and further analysis steps, such as Named Entity Recognition. We show that our proposed approaches work well when trained on synthetic data. Further, we propose a flexible metric learning approach to allow zero-shot classification of classes unseen during the network's training. Last, we propose a novel data synthesis algorithm to train off-the-shelf pixel-wise semantic segmentation networks for documents. Our data synthesis pipeline is based on the famous Style-GAN architecture and can synthesize realistic document images with their corresponding segmentation annotation without the need for any annotated data!}, language = {en} }