@phdthesis{Bartz2022, author = {Bartz, Christian}, title = {Reducing the annotation burden: deep learning for optical character recognition using less manual annotations}, doi = {10.25932/publishup-55540}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-555407}, school = {Universit{\"a}t Potsdam}, pages = {xxiv, 183}, year = {2022}, abstract = {Text is a ubiquitous entity in our world and daily life. We encounter it nearly everywhere in shops, on the street, or in our flats. Nowadays, more and more text is contained in digital images. These images are either taken using cameras, e.g., smartphone cameras, or taken using scanning devices such as document scanners. The sheer amount of available data, e.g., millions of images taken by Google Streetview, prohibits manual analysis and metadata extraction. Although much progress was made in the area of optical character recognition (OCR) for printed text in documents, broad areas of OCR are still not fully explored and hold many research challenges. With the mainstream usage of machine learning and especially deep learning, one of the most pressing problems is the availability and acquisition of annotated ground truth for the training of machine learning models because obtaining annotated training data using manual annotation mechanisms is time-consuming and costly. In this thesis, we address of how we can reduce the costs of acquiring ground truth annotations for the application of state-of-the-art machine learning methods to optical character recognition pipelines. To this end, we investigate how we can reduce the annotation cost by using only a fraction of the typically required ground truth annotations, e.g., for scene text recognition systems. We also investigate how we can use synthetic data to reduce the need of manual annotation work, e.g., in the area of document analysis for archival material. 
In the area of scene text recognition, we have developed a novel end-to-end scene text recognition system that can be trained using inexact supervision and shows competitive/state-of-the-art performance on standard benchmark datasets for scene text recognition. Our method consists of two independent neural networks, combined using spatial transformer networks. Both networks learn together to perform text localization and text recognition at the same time while only using annotations for the recognition task. We apply our model to end-to-end scene text recognition (meaning localization and recognition of words) and pure scene text recognition without any changes in the network architecture. In the second part of this thesis, we introduce novel approaches for using and generating synthetic data to analyze handwriting in archival data. First, we propose a novel preprocessing method to determine whether a given document page contains any handwriting. We propose a novel data synthesis strategy to train a classification model and show that our data synthesis strategy is viable by evaluating the trained model on real images from an archive. Second, we introduce the new analysis task of handwriting classification. Handwriting classification entails classifying a given handwritten word image into classes such as date, word, or number. Such an analysis step allows us to select the best fitting recognition model for subsequent text recognition; it also allows us to reason about the semantic content of a given document page without the need for fine-grained text recognition and further analysis steps, such as Named Entity Recognition. We show that our proposed approaches work well when trained on synthetic data. Further, we propose a flexible metric learning approach to allow zero-shot classification of classes unseen during the network's training. Last, we propose a novel data synthesis algorithm to train off-the-shelf pixel-wise semantic segmentation networks for documents. 
Our data synthesis pipeline is based on the famous Style-GAN architecture and can synthesize realistic document images with their corresponding segmentation annotation without the need for any annotated data!}, language = {en} } @phdthesis{BinTareaf2022, author = {Bin Tareaf, Raad}, title = {Social media based personality prediction models}, doi = {10.25932/publishup-54914}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-549142}, school = {Universit{\"a}t Potsdam}, pages = {x, 137}, year = {2022}, abstract = {Individuals have an intrinsic need to express themselves to other humans within a given community by sharing their experiences, thoughts, actions, and opinions. As a means, they mostly prefer to use modern online social media platforms such as Twitter, Facebook, personal blogs, and Reddit. Users of these social networks interact by drafting their own statuses updates, publishing photos, and giving likes leaving a considerable amount of data behind them to be analyzed. Researchers recently started exploring the shared social media data to understand online users better and predict their Big five personality traits: agreeableness, conscientiousness, extraversion, neuroticism, and openness to experience. This thesis intends to investigate the possible relationship between users' Big five personality traits and the published information on their social media profiles. Facebook public data such as linguistic status updates, meta-data of likes objects, profile pictures, emotions, or reactions records were adopted to address the proposed research questions. Several machine learning predictions models were constructed with various experiments to utilize the engineered features correlated with the Big 5 Personality traits. The final predictive performances improved the prediction accuracy compared to state-of-the-art approaches, and the models were evaluated based on established benchmarks in the domain. 
The research experiments were implemented while ethical and privacy points were concerned. Furthermore, the research aims to raise awareness about privacy between social media users and show what third parties can reveal about users' private traits from what they share and act on different social networking platforms. In the second part of the thesis, the variation in personality development is studied within a cross-platform environment such as Facebook and Twitter platforms. The constructed personality profiles in these social platforms are compared to evaluate the effect of the used platforms on one user's personality development. Likewise, personality continuity and stability analysis are performed using two social media platforms samples. The implemented experiments are based on ten-year longitudinal samples aiming to understand users' long-term personality development and further unlock the potential of cooperation between psychologists and data scientists.}, language = {en} }

% Journal article, JCSS volume 124. The page range is written with a bare double hyphen (no surrounding spaces) so styles typeset a clean en dash.
@article{BlaesiusFriedrichLischeidetal.2022,
  author    = {Bl{\"a}sius, Thomas and Friedrich, Tobias and Lischeid, Julius and Meeks, Kitty and Schirneck, Friedrich Martin},
  title     = {Efficiently enumerating hitting sets of hypergraphs arising in data profiling},
  series    = {Journal of computer and system sciences : JCSS},
  volume    = {124},
  journal   = {Journal of computer and system sciences : JCSS},
  publisher = {Elsevier},
  address   = {San Diego},
  issn      = {0022-0000},
  doi       = {10.1016/j.jcss.2021.10.002},
  pages     = {192--213},
  year      = {2022},
  abstract  = {The transversal hypergraph problem asks to enumerate the minimal hitting sets of a hypergraph. If the solutions have bounded size, Eiter and Gottlob [SICOMP'95] gave an algorithm running in output-polynomial time, but whose space requirement also scales with the output. We improve this to polynomial delay and space. Central to our approach is the extension problem, deciding for a set X of vertices whether it is contained in any minimal hitting set.
We show that this is one of the first natural problems to be W[3]-complete. We give an algorithm for the extension problem running in time $O(m^{|X|+1} n)$ and prove a SETH-lower bound showing that this is close to optimal. We apply our enumeration method to the discovery problem of minimal unique column combinations from data profiling. Our empirical evaluation suggests that the algorithm outperforms its worst-case guarantees on hypergraphs stemming from real-world databases.}, language = {en} }

% Journal article, Theoretical Computer Science volume 904. Page range uses a bare double hyphen; set relations in the abstract are written as LaTeX math.
@article{CaselFernauGhadikolaeietal.2022,
  author    = {Casel, Katrin and Fernau, Henning and Ghadikolaei, Mehdi Khosravian and Monnot, Jerome and Sikora, Florian},
  title     = {On the complexity of solution extension of optimization problems},
  series    = {Theoretical computer science : the journal of the EATCS},
  volume    = {904},
  journal   = {Theoretical computer science : the journal of the EATCS},
  publisher = {Elsevier},
  address   = {Amsterdam [u.a.]},
  issn      = {0304-3975},
  doi       = {10.1016/j.tcs.2021.10.017},
  pages     = {48--65},
  year      = {2022},
  abstract  = {The question if a given partial solution to a problem can be extended reasonably occurs in many algorithmic approaches for optimization problems. For instance, when enumerating minimal vertex covers of a graph $G = (V, E)$, one usually arrives at the problem to decide for a vertex set $U \subseteq V$ (pre-solution), if there exists a minimal vertex cover $S$ (i.e., a vertex cover $S \subseteq V$ such that no proper subset of $S$ is a vertex cover) with $U \subseteq S$ (minimal extension of $U$). We propose a general, partial-order based formulation of such extension problems which allows to model parameterization and approximation aspects of extension, and also highlights relationships between extension tasks for different specific problems. As examples, we study a number of specific problems which can be expressed and related in this framework.
In particular, we discuss extension variants of the problems dominating set and feedback vertex/edge set. All these problems are shown to be NP-complete even when restricted to bipartite graphs of bounded degree, with the exception of our extension version of feedback edge set on undirected graphs which is shown to be solvable in polynomial time. For the extension variants of dominating and feedback vertex set, we also show NP-completeness for the restriction to planar graphs of bounded degree. As non-graph problem, we also study an extension version of the bin packing problem. We further consider the parameterized complexity of all these extension variants, where the parameter is a measure of the pre-solution as defined by our framework.}, language = {en} } @article{CaselFischbeckFriedrichetal.2022, author = {Casel, Katrin and Fischbeck, Philipp and Friedrich, Tobias and G{\"o}bel, Andreas and Lagodzinski, J. A. Gregor}, title = {Zeros and approximations of Holant polynomials on the complex plane}, series = {Computational complexity : CC}, volume = {31}, journal = {Computational complexity : CC}, number = {2}, publisher = {Springer}, address = {Basel}, issn = {1016-3328}, doi = {10.1007/s00037-022-00226-5}, pages = {52}, year = {2022}, abstract = {We present fully polynomial time approximation schemes for a broad class of Holant problems with complex edge weights, which we call Holant polynomials. We transform these problems into partition functions of abstract combinatorial structures known as polymers in statistical physics. Our method involves establishing zero-free regions for the partition functions of polymer models and using the most significant terms of the cluster expansion to approximate them. Results of our technique include new approximation and sampling algorithms for a diverse class of Holant polynomials in the low-temperature regime (i.e. small external field) and approximation algorithms for general Holant problems with small signature weights. 
Additionally, we give randomised approximation and sampling algorithms with faster running times for more restrictive classes. Finally, we improve the known zero-free regions for a perfect matching polynomial.}, language = {en} } @article{ChandranIssacLaurietal.2022, author = {Chandran, Sunil L. and Issac, Davis and Lauri, Juho and van Leeuwen, Erik Jan}, title = {Upper bounding rainbow connection number by forest number}, series = {Discrete mathematics}, volume = {345}, journal = {Discrete mathematics}, number = {7}, publisher = {Elsevier}, address = {Amsterdam [u.a.]}, issn = {0012-365X}, doi = {10.1016/j.disc.2022.112829}, pages = {22}, year = {2022}, abstract = {A path in an edge-colored graph is rainbow if no two edges of it are colored the same, and the graph is rainbow-connected if there is a rainbow path between each pair of its vertices. The minimum number of colors needed to rainbow-connect a graph G is the rainbow connection number of G, denoted by rc(G). A simple way to rainbow-connect a graph G is to color the edges of a spanning tree with distinct colors and then re-use any of these colors to color the remaining edges of G. This proves that $\mathrm{rc}(G) \le |V(G)| - 1$. We ask whether there is a stronger connection between tree-like structures and rainbow coloring than that is implied by the above trivial argument. For instance, is it possible to find an upper bound of $t(G) - 1$ for rc(G), where t(G) is the number of vertices in the largest induced tree of G? The answer turns out to be negative, as there are counter-examples that show that even $c \cdot t(G)$ is not an upper bound for rc(G) for any given constant c. In this work we show that if we consider the forest number f(G), the number of vertices in a maximum induced forest of G, instead of t(G), then surprisingly we do get an upper bound. More specifically, we prove that $\mathrm{rc}(G) \le f(G) + 2$. 
Our result indicates a stronger connection between rainbow connection and tree-like structures than that was suggested by the simple spanning tree based upper bound.}, language = {en} }

% Journal article, Artificial Intelligence and Law volume 31. Page range uses a bare double hyphen; the parenthetical in the abstract is set off with LaTeX em dashes.
@article{CoupetteHartungBeckedorfetal.2022,
  author    = {Coupette, Corinna and Hartung, Dirk and Beckedorf, Janis and B{\"o}ther, Maximilian and Katz, Daniel Martin},
  title     = {Law smells},
  series    = {Artificial intelligence and law},
  volume    = {31},
  journal   = {Artificial intelligence and law},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {0924-8463},
  doi       = {10.1007/s10506-022-09315-w},
  pages     = {335--368},
  year      = {2022},
  abstract  = {Building on the computer science concept of code smells, we initiate the study of law smells, i.e., patterns in legal texts that pose threats to the comprehensibility and maintainability of the law. With five intuitive law smells as running examples---namely, duplicated phrase, long element, large reference tree, ambiguous syntax, and natural language obsession---, we develop a comprehensive law smell taxonomy. This taxonomy classifies law smells by when they can be detected, which aspects of law they relate to, and how they can be discovered. We introduce text-based and graph-based methods to identify instances of law smells, confirming their utility in practice using the United States Code as a test case.
Our work demonstrates how ideas from software engineering can be leveraged to assess and improve the quality of legal code, thus drawing attention to an understudied area in the intersection of law and computer science and highlighting the potential of computational legal drafting.}, language = {en} } @article{dePaulaMarxWolfetal.2022, author = {de Paula, Danielly and Marx, Carolin and Wolf, Ella and Dremel, Christian and Cormican, Kathryn and Uebernickel, Falk}, title = {A managerial mental model to drive innovation in the context of digital transformation}, series = {Industry and innovation}, journal = {Industry and innovation}, publisher = {Routledge, Taylor \& Francis Group}, address = {Abingdon}, issn = {1366-2716}, doi = {10.1080/13662716.2022.2072711}, pages = {24}, year = {2022}, abstract = {Industry 4.0 is transforming how businesses innovate and, as a result, companies are spearheading the movement towards 'Digital Transformation'. While some scholars advocate the use of design thinking to identify new innovative behaviours, cognition experts emphasise the importance of top managers in supporting employees to develop these behaviours. However, there is a dearth of research in this domain and companies are struggling to implement the required behaviours. To address this gap, this study aims to identify and prioritise behavioural strategies conducive to design thinking to inform the creation of a managerial mental model. We identify 20 behavioural strategies from 45 interviewees with practitioners and educators and combine them with the concepts of 'paradigm-mindset-mental model' from cognition theory. 
The paper contributes to the body of knowledge by identifying and prioritising specific behavioural strategies to form a novel set of survival conditions aligned to the new industrial paradigm of Industry 4.0.}, language = {en} } @phdthesis{Draisbach2022, author = {Draisbach, Uwe}, title = {Efficient duplicate detection and the impact of transitivity}, doi = {10.25932/publishup-57214}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-572140}, school = {Universit{\"a}t Potsdam}, pages = {x, 150}, year = {2022}, abstract = {Duplicate detection describes the process of finding multiple representations of the same real-world entity in the absence of a unique identifier, and has many application areas, such as customer relationship management, genealogy and social sciences, or online shopping. Due to the increasing amount of data in recent years, the problem has become even more challenging on the one hand, but has led to a renaissance in duplicate detection research on the other hand. This thesis examines the effects and opportunities of transitive relationships on the duplicate detection process. Transitivity implies that if record pairs $\langle r_i, r_j \rangle$ and $\langle r_j, r_k \rangle$ are classified as duplicates, then also record pair $\langle r_i, r_k \rangle$ has to be a duplicate. However, this reasoning might contradict with the pairwise classification, which is usually based on the similarity of objects. An essential property of similarity, in contrast to equivalence, is that similarity is not necessarily transitive. First, we experimentally evaluate the effect of an increasing data volume on the threshold selection to classify whether a record pair is a duplicate or non-duplicate. Our experiments show that independently of the pair selection algorithm and the used similarity measure, selecting a suitable threshold becomes more difficult with an increasing number of records due to an increased probability of adding a false duplicate to an existing cluster. 
Thus, the best threshold changes with the dataset size, and a good threshold for a small (possibly sampled) dataset is not necessarily a good threshold for a larger (possibly complete) dataset. As data grows over time, earlier selected thresholds are no longer a suitable choice, and the problem becomes worse for datasets with larger clusters. Second, we present with the Duplicate Count Strategy (DCS) and its enhancement DCS++ two alternatives to the standard Sorted Neighborhood Method (SNM) for the selection of candidate record pairs. DCS adapts SNMs window size based on the number of detected duplicates and DCS++ uses transitive dependencies to save complex comparisons for finding duplicates in larger clusters. We prove that with a proper (domain- and data-independent!) threshold, DCS++ is more efficient than SNM without loss of effectiveness. Third, we tackle the problem of contradicting pairwise classifications. Usually, the transitive closure is used for pairwise classifications to obtain a transitively closed result set. However, the transitive closure disregards negative classifications. We present three new and several existing clustering algorithms and experimentally evaluate them on various datasets and under various algorithm configurations. The results show that the commonly used transitive closure is inferior to most other clustering algorithms, especially for the precision of results. In scenarios with larger clusters, our proposed EMCC algorithm is, together with Markov Clustering, the best performing clustering approach for duplicate detection, although its runtime is longer than Markov Clustering due to the subexponential time complexity. 
EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} }

% German-language final report. Acronyms in the title are brace-protected so sentence-casing styles keep their capitals.
% NOTE(review): this entry type requires an institution field; the export does not provide one -- confirm and add.
@techreport{DoellnerFriedrichArnrichetal.2022,
  author   = {D{\"o}llner, J{\"u}rgen Roland Friedrich and Friedrich, Tobias and Arnrich, Bert and Hirschfeld, Robert and Lippert, Christoph and Meinel, Christoph},
  title    = {Abschlussbericht {KI-Labor} {ITSE}},
  doi      = {10.25932/publishup-57860},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-578604},
  pages    = {60},
  year     = {2022},
  abstract = {Der Abschlussbericht beschreibt Aufgaben und Ergebnisse des KI-Labors "ITSE". Gegenstand des KI-Labors bildeten Methodik, Technik und Ausbildung in der IT-Systemtechnik zur Analyse, Planung und Konstruktion KI-basierter, komplexer IT-Systeme.},
  language = {de}
}

% Numbered report published as a book (no. 145).
% The export listed the publisher field twice; the press is kept as publisher and the university value is stored
% under institution, so BibTeX no longer reports a repeated field.
% The series name is the one used by the other report in this file with ISSN 1613-5652; a number without a series
% triggers a BibTeX warning.
@book{DuerschReinMattisetal.2022,
  author      = {D{\"u}rsch, Falco and Rein, Patrick and Mattis, Toni and Hirschfeld, Robert},
  title       = {Learning from failure},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {145},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-528-6},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-53755},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-537554},
  institution = {Universit{\"a}t Potsdam},
  pages       = {87},
  year        = {2022},
  abstract    = {Regression testing is a widespread practice in today's software industry to ensure software product quality. Developers derive a set of test cases, and execute them frequently to ensure that their change did not adversely affect existing functionality. As the software product and its test suite grow, the time to feedback during regression test sessions increases, and impedes programmer productivity: developers wait longer for tests to complete, and delays in fault detection render fault removal increasingly difficult.
Test case prioritization addresses the problem of long feedback loops by reordering test cases, such that test cases of high failure probability run first, and test case failures become actionable early in the testing process. We ask, given test execution schedules reconstructed from publicly available data, to which extent can their fault detection efficiency improved, and which technique yields the most efficient test schedules with respect to APFD? To this end, we recover regression 6200 test sessions from the build log files of Travis CI, a popular continuous integration service, and gather 62000 accompanying changelists. We evaluate the efficiency of current test schedules, and examine the prioritization results of state-of-the-art lightweight, history-based heuristics. We propose and evaluate a novel set of prioritization algorithms, which connect software changes and test failures in a matrix-like data structure. Our studies indicate that the optimization potential is substantial, because the existing test plans score only 30\% APFD. The predictive power of past test failures proves to be outstanding: simple heuristics, such as repeating tests with failures in recent sessions, result in efficiency scores of 95\% APFD. The best-performing matrix-based heuristic achieves a similar score of 92.5\% APFD. In contrast to prior approaches, we argue that matrix-based techniques are useful beyond the scope of effective prioritization, and enable a number of use cases involving software maintenance. We validate our findings from continuous integration processes by extending a continuous testing tool within development environments with means of test prioritization, and pose further research questions. 
We think that our findings are suited to propel adoption of (continuous) testing practices, and that programmers' toolboxes should contain test prioritization as an existential productivity tool.}, language = {en} }

% Numbered report published as a book (series no. 135).
% The export listed the publisher field twice; the press is kept as publisher and the university value is stored
% under institution, so BibTeX no longer reports a repeated field.
@book{EichenrothReinHirschfeld2022,
  author      = {Eichenroth, Friedrich and Rein, Patrick and Hirschfeld, Robert},
  title       = {Fast packrat parsing in a live programming environment},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  journal     = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {135},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-503-3},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-49124},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-491242},
  institution = {Universit{\"a}t Potsdam},
  pages       = {79},
  year        = {2022},
  abstract    = {Language developers who design domain-specific languages or new language features need a way to make fast changes to language definitions. Those fast changes require immediate feedback. Also, it should be possible to parse the developed languages quickly to handle extensive sets of code. Parsing expression grammars provides an easy to understand method for language definitions. Packrat parsing is a method to parse grammars of this kind, but this method is unable to handle left-recursion properly. Existing solutions either partially rewrite left-recursive rules and partly forbid them, or use complex extensions to packrat parsing that are hard to understand and cost-intensive. We investigated methods to make parsing as fast as possible, using easy to follow algorithms while not losing the ability to make fast changes to grammars. We focused our efforts on two approaches. One is to start from an existing technique for limited left-recursion rewriting and enhance it to work for general left-recursive grammars.
The second approach is to design a grammar compilation process to find left-recursion before parsing, and in this way, reduce computational costs wherever possible and generate ready to use parser classes. Rewriting parsing expression grammars is a task that, if done in a general way, unveils a large number of cases such that any rewriting algorithm surpasses the complexity of other left-recursive parsing algorithms. Lookahead operators introduce this complexity. However, most languages have only little portions that are left-recursive and in virtually all cases, have no indirect or hidden left-recursion. This means that the distinction of left-recursive parts of grammars from components that are non-left-recursive holds great improvement potential for existing parsers. In this report, we list all the required steps for grammar rewriting to handle left-recursion, including grammar analysis, grammar rewriting itself, and syntax tree restructuring. Also, we describe the implementation of a parsing expression grammar framework in Squeak/Smalltalk and the possible interactions with the already existing parser Ohm/S. We quantitatively benchmarked this framework directing our focus on parsing time and the ability to use it in a live programming context. Compared with Ohm, we achieved massive parsing time improvements while preserving the ability to use our parser it as a live programming tool. The work is essential because, for one, we outlined the difficulties and complexity that come with grammar rewriting. 
Also, we removed the existing limitations that came with left-recursion by eliminating them before parsing.}, language = {en} } @phdthesis{Elsaid2022, author = {Elsaid, Mohamed Esameldin Mohamed}, title = {Virtual machines live migration cost modeling and prediction}, doi = {10.25932/publishup-54001}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-540013}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 107}, year = {2022}, abstract = {Dynamic resource management is an essential requirement for private and public cloud computing environments. With dynamic resource management, the physical resources assignment to the cloud virtual resources depends on the actual need of the applications or the running services, which enhances the cloud physical resources utilization and reduces the offered services cost. In addition, the virtual resources can be moved across different physical resources in the cloud environment without an obvious impact on the running applications or services production. This means that the availability of the running services and applications in the cloud is independent on the hardware resources including the servers, switches and storage failures. This increases the reliability of using cloud services compared to the classical data-centers environments. In this thesis we briefly discuss the dynamic resource management topic and then deeply focus on live migration as the definition of the compute resource dynamic management. Live migration is a commonly used and an essential feature in cloud and virtual data-centers environments. Cloud computing load balance, power saving and fault tolerance features are all dependent on live migration to optimize the virtual and physical resources usage. As we will discuss in this thesis, live migration shows many benefits to cloud and virtual data-centers environments, however the cost of live migration can not be ignored. 
Live migration cost includes the migration time, downtime, network overhead, power consumption increases and CPU overhead. IT admins run virtual machines live migrations without an idea about the migration cost. So, resources bottlenecks, higher migration cost and migration failures might happen. The first problem that we discuss in this thesis is how to model the cost of the virtual machines live migration. Secondly, we investigate how to make use of machine learning techniques to help the cloud admins getting an estimation of this cost before initiating the migration for one of multiple virtual machines. Also, we discuss the optimal timing for a specific virtual machine before live migration to another server. Finally, we propose practical solutions that can be used by the cloud admins to be integrated with the cloud administration portals to answer the raised research questions above. Our research methodology to achieve the project objectives is to propose empirical models based on using VMware test-beds with different benchmarks tools. Then we make use of the machine learning techniques to propose a prediction approach for virtual machines live migration cost. Timing optimization for live migration is also proposed in this thesis based on using the cost prediction and data-centers network utilization prediction. Live migration with persistent memory clusters is also discussed at the end of the thesis. The cost prediction and timing optimization techniques proposed in this thesis could be practically integrated with VMware vSphere cluster portal such that the IT admins can now use the cost prediction feature and timing optimization option before proceeding with a virtual machine live migration. 
Testing results show that our proposed approach for VMs live migration cost prediction shows acceptable results with less than 20\% prediction error and can be easily implemented and integrated with VMware vSphere as an example of a commonly used resource management portal for virtual data-centers and private cloud environments. The results show that using our proposed VMs migration timing optimization technique also could save up to 51\% of migration time of the VMs migration time for memory intensive workloads and up to 27\% of the migration time for network intensive workloads. This timing optimization technique can be useful for network admins to save migration time with utilizing higher network rate and higher probability of success. At the end of this thesis, we discuss the persistent memory technology as a new trend in servers memory technology. Persistent memory modes of operation and configurations are discussed in detail to explain how live migration works between servers with different memory configuration set up. Then, we build a VMware cluster with persistent memory inside server and also with DRAM only servers to show the live migration cost difference between the VMs with DRAM only versus the VMs with persistent memory inside.}, language = {en} }
@article{FehrJaramilloGutierrezOalaetal.2022,
  author    = {Fehr, Jana and Jaramillo-Gutierrez, Giovanna and Oala, Luis and Gr{\"o}schel, Matthias I. and Bierwirth, Manuel and Balachandran, Pradeep and Werneck-Leite, Alixandro and Lippert, Christoph},
  title     = {Piloting a Survey-Based Assessment of Transparency and Trustworthiness with Three Medical {AI} Tools},
  journal   = {Healthcare},
  volume    = {10},
  number    = {10},
  publisher = {MDPI},
  address   = {Basel, Schweiz},
  issn      = {2227-9032},
  doi       = {10.3390/healthcare10101923},
  pages     = {30},
  year      = {2022},
  abstract  = {Artificial intelligence (AI) offers the potential to support healthcare delivery, but poorly trained or validated algorithms bear risks of harm. Ethical guidelines stated transparency about model development and validation as a requirement for trustworthy AI. Abundant guidance exists to provide transparency through reporting, but poorly reported medical AI tools are common. To close this transparency gap, we developed and piloted a framework to quantify the transparency of medical AI tools with three use cases. Our framework comprises a survey to report on the intended use, training and validation data and processes, ethical considerations, and deployment recommendations. The transparency of each response was scored with either 0, 0.5, or 1 to reflect if the requested information was not, partially, or fully provided. Additionally, we assessed on an analogous three-point scale if the provided responses fulfilled the transparency requirement for a set of trustworthiness criteria from ethical guidelines. The degree of transparency and trustworthiness was calculated on a scale from 0\% to 100\%. Our assessment of three medical AI use cases pin-pointed reporting gaps and resulted in transparency scores of 67\% for two use cases and one with 59\%. We report anecdotal evidence that business constraints and limited information from external datasets were major obstacles to providing transparency for the three use cases. The observed transparency gaps also lowered the degree of trustworthiness, indicating compliance gaps with ethical guidelines.
All three pilot use cases faced challenges to provide transparency about medical AI tools, but more studies are needed to investigate those in the wider medical AI sector. Applying this framework for an external assessment of transparency may be infeasible if business constraints prevent the disclosure of information. New strategies may be necessary to enable audits of medical AI tools while preserving business secrets.}, language = {en} }
@comment{Secondary publication (per its series title) of the same work as the Healthcare article record FehrJaramilloGutierrezOalaetal.2022. Both records were exported under one citation key, so this one carries an "a" suffix; BibTeX otherwise reports a repeated entry and drops it.}
@misc{FehrJaramilloGutierrezOalaetal.2022a,
  author   = {Fehr, Jana and Jaramillo-Gutierrez, Giovanna and Oala, Luis and Gr{\"o}schel, Matthias I. and Bierwirth, Manuel and Balachandran, Pradeep and Werneck-Leite, Alixandro and Lippert, Christoph},
  title    = {Piloting a Survey-Based Assessment of Transparency and Trustworthiness with Three Medical {AI} Tools},
  series   = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t},
  number   = {15},
  doi      = {10.25932/publishup-58328},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-583281},
  pages    = {30},
  year     = {2022},
  abstract = {Artificial intelligence (AI) offers the potential to support healthcare delivery, but poorly trained or validated algorithms bear risks of harm. Ethical guidelines stated transparency about model development and validation as a requirement for trustworthy AI. Abundant guidance exists to provide transparency through reporting, but poorly reported medical AI tools are common. To close this transparency gap, we developed and piloted a framework to quantify the transparency of medical AI tools with three use cases. Our framework comprises a survey to report on the intended use, training and validation data and processes, ethical considerations, and deployment recommendations.
The transparency of each response was scored with either 0, 0.5, or 1 to reflect if the requested information was not, partially, or fully provided. Additionally, we assessed on an analogous three-point scale if the provided responses fulfilled the transparency requirement for a set of trustworthiness criteria from ethical guidelines. The degree of transparency and trustworthiness was calculated on a scale from 0\% to 100\%. Our assessment of three medical AI use cases pin-pointed reporting gaps and resulted in transparency scores of 67\% for two use cases and one with 59\%. We report anecdotal evidence that business constraints and limited information from external datasets were major obstacles to providing transparency for the three use cases. The observed transparency gaps also lowered the degree of trustworthiness, indicating compliance gaps with ethical guidelines. All three pilot use cases faced challenges to provide transparency about medical AI tools, but more studies are needed to investigate those in the wider medical AI sector. Applying this framework for an external assessment of transparency may be infeasible if business constraints prevent the disclosure of information. 
New strategies may be necessary to enable audits of medical AI tools while preserving business secrets.}, language = {en} }
@book{FlottererMaximovaSchneideretal.2022,
  author    = {Flotterer, Boris and Maximova, Maria and Schneider, Sven and Dyck, Johannes and Z{\"o}llner, Christian and Giese, Holger and H{\'e}ly, Christelle and Gaucherel, C{\'e}dric},
  title     = {Modeling and Formal Analysis of Meta-Ecosystems with Dynamic Structure using Graph Transformation},
  series    = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number    = {147},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-533-0},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-54764},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-547643},
  pages     = {47},
  year      = {2022},
  abstract  = {The dynamics of ecosystems is of crucial importance. Various model-based approaches exist to understand and analyze their internal effects. In this paper, we model the space structure dynamics and ecological dynamics of meta-ecosystems using the formal technique of Graph Transformation (short GT). We build GT models to describe how a meta-ecosystem (modeled as a graph) can evolve over time (modeled by GT rules) and to analyze these GT models with respect to qualitative properties such as the existence of structural stabilities. As a case study, we build three GT models describing the space structure dynamics and ecological dynamics of three different savanna meta-ecosystems.
The first GT model considers a savanna meta-ecosystem that is limited in space to two ecosystem patches, whereas the other two GT models consider two savanna meta-ecosystems that are unlimited in the number of ecosystem patches and only differ in one GT rule describing how the space structure of the meta-ecosystem grows. In the first two GT models, the space structure dynamics and ecological dynamics of the meta-ecosystem shows two main structural stabilities: the first one based on grassland-savanna-woodland transitions and the second one based on grassland-desert transitions. The transition between these two structural stabilities is driven by high-intensity fires affecting the tree components. In the third GT model, the GT rule for savanna regeneration induces desertification and therefore a collapse of the meta-ecosystem. We believe that GT models provide a complementary avenue to that of existing approaches to rigorously study ecological phenomena.}, language = {en} }
@comment{NOTE(review): the series field was added because a number without a series triggers a BibTeX warning. The series title is inferred from ISSN 1613-5652 - confirm against the publisher record.}
@book{FreundRaetschHradilaketal.2022,
  author    = {Freund, Rieke and R{\"a}tsch, Jan Philip and Hradilak, Franziska and Vidic, Benedikt and He{\ss}, Oliver and Li{\ss}ner, Nils and W{\"o}lert, Hendrik and Lincke, Jens and Beckmann, Tom and Hirschfeld, Robert},
  title     = {Implementing a crowd-sourced picture archive for {Bad Harzburg}},
  series    = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number    = {149},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-545-3},
  issn      = {1613-5652},
  doi       = {10.25932/publishup-56029},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-560291},
  pages     = {x, 191},
  year      = {2022},
  abstract  = {Pictures are a medium that helps make the past tangible and preserve memories. Without context, they are not able to do so. Pictures are brought to life by their associated stories. However, the older pictures become, the fewer contemporary witnesses can tell these stories. Especially for large, analog picture archives, knowledge and memories are spread over many people.
This creates several challenges: First, the pictures must be digitized to save them from decaying and make them available to the public. Since a simple listing of all the pictures is confusing, the pictures should be structured accessibly. Second, known information that makes the stories vivid needs to be added to the pictures. Users should get the opportunity to contribute their knowledge and memories. To make this usable for all interested parties, even for older, less technophile generations, the interface should be intuitive and error-tolerant. The resulting requirements are not covered in their entirety by any existing software solution without losing the intuitive interface or the scalability of the system. Therefore, we have developed our digital picture archive within the scope of a bachelor project in cooperation with the Bad Harzburg-Stiftung. For the implementation of this web application, we use the UI framework React in the frontend, which communicates via a GraphQL interface with the Content Management System Strapi in the backend. The use of this system enables our project partner to create an efficient process from scanning analog pictures to presenting them to visitors in an organized and annotated way. To customize the solution for both picture delivery and information contribution for our target group, we designed prototypes and evaluated them with people from Bad Harzburg. This helped us gain valuable insights into our system's usability and future challenges as well as requirements. Our web application is already being used daily by our project partner. 
During the project, we still came up with numerous ideas for additional features to further support the exchange of knowledge.}, language = {en} }
@article{GenskeJahnke2022,
  author    = {Genske, Ulrich and Jahnke, Paul},
  title     = {Human observer net},
  journal   = {Radiology},
  volume    = {303},
  number    = {3},
  publisher = {Radiological Society of North America (RSNA)},
  address   = {Oak Brook},
  issn      = {0033-8419},
  doi       = {10.1148/radiol.211832},
  pages     = {524--530},
  year      = {2022},
  abstract  = {Background: Current software applications for human observer studies of images lack flexibility in study design, platform independence, multicenter use, and assessment methods and are not open source, limiting accessibility and expandability. Purpose: To develop a user-friendly software platform that enables efficient human observer studies in medical imaging with flexibility of study design. Materials and Methods: Software for human observer imaging studies was designed as an open-source web application to facilitate access, platform-independent usability, and multicenter studies. Different interfaces for study creation, participation, and management of results were implemented. The software was evaluated in human observer experiments between May 2019 and March 2021, in which duration of observer responses was tracked. Fourteen radiologists evaluated and graded software usability using the 100-point system usability scale. The application was tested in Chrome, Firefox, Safari, and Edge browsers. Results: Software function was designed to allow visual grading analysis (VGA), multiple-alternative forced-choice (m-AFC), receiver operating characteristic (ROC), localization ROC, free-response ROC, and customized designs.
The mean duration of reader responses per image or per image set was 6.2 seconds $\pm$ 4.8 (standard deviation), 5.8 seconds $\pm$ 4.7, 8.7 seconds $\pm$ 5.7, and 6.0 seconds $\pm$ 4.5 in four-AFC with 160 image quartets per reader, four-AFC with 640 image quartets per reader, localization ROC, and experimental studies, respectively. The mean system usability scale score was 83 $\pm$ 11 (out of 100). The documented code and a demonstration of the application are available online (https://github.com/genskeu/HON, https://hondemo.pythonanywhere.com/). Conclusion: A user-friendly and efficient open-source application was developed for human reader experiments that enables study design versatility, as well as platform-independent and multicenter usability.}, language = {en} }
@book{GerkenUebernickeldePaula2022,
  author    = {Gerken, Stefanie and Uebernickel, Falk and de Paula, Danielly},
  title     = {Design Thinking: a Global Study on Implementation Practices in Organizations},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address   = {Potsdam},
  isbn      = {978-3-86956-525-5},
  doi       = {10.25932/publishup-53466},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-534668},
  pages     = {230},
  year      = {2022},
  abstract  = {These days design thinking is no longer a "new approach". Among practitioners, as well as academics, interest in the topic has gathered pace over the last two decades. However, opinions are divided over the longevity of the phenomenon: whether design thinking is merely "old wine in new bottles," a passing trend, or still evolving as it is being spread to an increasing number of organizations and industries. Despite its growing relevance and the diffusion of design thinking, knowledge on the actual status quo in organizations remains scarce. With a new study, the research team of Prof.
Uebernickel and Stefanie Gerken investigates temporal developments and changes in design thinking practices in organizations over the past six years comparing the results of the 2015 "Parts without a whole" study with current practices and future developments. Companies of all sizes and from different parts of the world participated in the survey. The findings from qualitative interviews with experts, i.e., people who have years of knowledge with design thinking, were cross-checked with the results from an exploratory analysis of the survey data. This analysis uncovers significant variances and similarities in how design thinking is interpreted and applied in businesses.}, language = {en} }
@phdthesis{Gruener2022,
  author   = {Gr{\"u}ner, Andreas},
  title    = {Towards practical and trust-enhancing attribute aggregation for self-sovereign identity},
  doi      = {10.25932/publishup-56745},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-567450},
  school   = {Universit{\"a}t Potsdam},
  pages    = {xvii, 175},
  year     = {2022},
  abstract = {Identity management is at the forefront of applications' security posture. It separates the unauthorised user from the legitimate individual. Identity management models have evolved from the isolated to the centralised paradigm and identity federations. Within this advancement, the identity provider emerged as a trusted third party that holds a powerful position. Allen postulated the novel self-sovereign identity paradigm to establish a new balance. Thus, extensive research is required to comprehend its virtues and limitations. Analysing the new paradigm, initially, we investigate the blockchain-based self-sovereign identity concept structurally. Moreover, we examine trust requirements in this context by reference to patterns. These shapes comprise major entities linked by a decentralised identity provider. By comparison to the traditional models, we conclude that trust in credential management and authentication is removed.
Trust-enhancing attribute aggregation based on multiple attribute providers provokes a further trust shift. Subsequently, we formalise attribute assurance trust modelling by a metaframework. It encompasses the attestation and trust network as well as the trust decision process, including the trust function, as central components. A secure attribute assurance trust model depends on the security of the trust function. The trust function should consider high trust values and several attribute authorities. Furthermore, we evaluate classification, conceptual study, practical analysis and simulation as assessment strategies of trust models. For realising trust-enhancing attribute aggregation, we propose a probabilistic approach. The method exerts the principle characteristics of correctness and validity. These values are combined for one provider and subsequently for multiple issuers. We embed this trust function in a model within the self-sovereign identity ecosystem. To practically apply the trust function and solve several challenges for the service provider that arise from adopting self-sovereign identity solutions, we conceptualise and implement an identity broker. The mediator applies a component-based architecture to abstract from a single solution. Standard identity and access management protocols build the interface for applications. We can conclude that the broker's usage at the side of the service provider does not undermine self-sovereign principles, but fosters the advancement of the ecosystem. The identity broker is applied to sample web applications with distinct attribute requirements to showcase usefulness for authentication and attribute-based access control within a case study.}, language = {en} }