% Bibliography database: ten entries dated 2024 (three PhD theses, four journal
% articles, one book, two conference papers).
% Conventions: one field per line; page ranges use `--` without surrounding
% spaces; non-ASCII characters are written as LaTeX escapes.

@phdthesis{Vitagliano2024,
  author = {Vitagliano, Gerardo},
  title = {Modeling the structure of tabular files for data preparation},
  doi = {10.25932/publishup-62435},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624351},
  school = {Universit{\"a}t Potsdam},
  pages = {ii, 114},
  year = {2024},
  abstract = {To manage tabular data files and leverage their content in a given downstream task, practitioners often design and execute complex transformation pipelines to prepare them. The complexity of such pipelines stems from different factors, including the nature of the preparation tasks, often exploratory or ad-hoc to specific datasets; the large repertory of tools, algorithms, and frameworks that practitioners need to master; and the volume, variety, and velocity of the files to be prepared. Metadata plays a fundamental role in reducing this complexity: characterizing a file assists end users in the design of data preprocessing pipelines, and furthermore paves the way for suggestion, automation, and optimization of data preparation tasks. Previous research in the areas of data profiling, data integration, and data cleaning, has focused on extracting and characterizing metadata regarding the content of tabular data files, i.e., about the records and attributes of tables. Content metadata are useful for the latter stages of a preprocessing pipeline, e.g., error correction, duplicate detection, or value normalization, but they require a properly formed tabular input. Therefore, these metadata are not relevant for the early stages of a preparation pipeline, i.e., to correctly parse tables out of files. In this dissertation, we turn our focus to what we call the structure of a tabular data file, i.e., the set of characters within a file that do not represent data values but are required to parse and understand the content of the file.
We provide three different approaches to represent file structure, an explicit representation based on context-free grammars; an implicit representation based on file-wise similarity; and a learned representation based on machine learning. In our first contribution, we use the grammar-based representation to characterize a set of over 3000 real-world csv files and identify multiple structural issues that let files deviate from the csv standard, e.g., by having inconsistent delimiters or containing multiple tables. We leverage our learnings about real-world files and propose Pollock, a benchmark to test how well systems parse csv files that have a non-standard structure, without any previous preparation. We report on our experiments on using Pollock to evaluate the performance of 16 real-world data management systems. Following, we characterize the structure of files implicitly, by defining a measure of structural similarity for file pairs. We design a novel algorithm to compute this measure, which is based on a graph representation of the files' content. We leverage this algorithm and propose Mondrian, a graphical system to assist users in identifying layout templates in a dataset, classes of files that have the same structure, and therefore can be prepared by applying the same preparation pipeline. Finally, we introduce MaGRiTTE, a novel architecture that uses self-supervised learning to automatically learn structural representations of files in the form of vectorial embeddings at three different levels: cell level, row level, and file level. We experiment with the application of structural embeddings for several tasks, namely dialect detection, row classification, and data preparation efforts estimation.
Our experimental results show that structural metadata, either identified explicitly on parsing grammars, derived implicitly as file-wise similarity, or learned with the help of machine learning architectures, is fundamental to automate several tasks, to scale up preparation to large quantities of files, and to provide repeatable preparation pipelines.},
  language = {en}
}

@article{KuehlerDrathschmidtGrossmann2024,
  author = {K{\"u}hler, Jakob and Drathschmidt, Nicolas and Gro{\ss}mann, Daniela},
  title = {`Modern talking'},
  series = {Information polity},
  volume = {29},
  journal = {Information polity},
  number = {2},
  publisher = {IOS Press},
  address = {Amsterdam},
  issn = {1570-1255},
  doi = {10.3233/IP-230059},
  pages = {199--216},
  year = {2024},
  abstract = {Despite growing interest, we lack a clear understanding of how the arguably ambiguous phenomenon of agile is perceived in government practice. This study aims to alleviate this puzzle by investigating how managers and employees in German public sector organisations make sense of agile as a spreading management fashion in the form of narratives. This is important because narratives function as innovation carriers that ultimately influence the manifestations of the concept in organisations. Based on a multi-case study of 31 interviews and 24 responses to a qualitative online survey conducted in 2021 and 2022, we provide insights into what public sector managers, employees and consultants understand (and, more importantly, do not understand) as agile and how they weave it into their existing reality of bureaucratic organisations. We uncover three meta-narratives of agile government, which we label 'renew', 'complement' and 'integrate'. In particular, the meta-narratives differ in their positioning of how agile interacts with the characteristics of bureaucratic organisations. Importantly, we also show that agile as a management fad serves as a projection surface for what actors want from a modern and digital organisation.
Thus, the vocabulary of agile government within the narratives is inherently linked to other diffusing phenomena such as new work or digitalisation.},
  language = {en}
}

@phdthesis{Halfpap2024,
  author = {Halfpap, Stefan},
  title = {Integer linear programming-based heuristics for partially replicated database clusters and selecting indexes},
  doi = {10.25932/publishup-63361},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-633615},
  school = {Universit{\"a}t Potsdam},
  pages = {iii, 185},
  year = {2024},
  abstract = {Column-oriented database systems can efficiently process transactional and analytical queries on a single node. However, increasing or peak analytical loads can quickly saturate single-node database systems. Then, a common scale-out option is using a database cluster with a single primary node for transaction processing and read-only replicas. Using (the naive) full replication, queries are distributed among nodes independently of the accessed data. This approach is relatively expensive because all nodes must store all data and apply all data modifications caused by inserts, deletes, or updates. In contrast to full replication, partial replication is a more cost-efficient implementation: Instead of duplicating all data to all replica nodes, partial replicas store only a subset of the data while being able to process a large workload share. Besides lower storage costs, partial replicas enable (i) better scaling because replicas must potentially synchronize only subsets of the data modifications and thus have more capacity for read-only queries and (ii) better elasticity because replicas have to load less data and can be set up faster. However, splitting the overall workload evenly among the replica nodes while optimizing the data allocation is a challenging assignment problem. The calculation of optimized data allocations in a partially replicated database cluster can be modeled using integer linear programming (ILP).
ILP is a common approach for solving assignment problems, also in the context of database systems. Because ILP is not scalable, existing approaches (also for calculating partial allocations) often fall back to simple (e.g., greedy) heuristics for larger problem instances. Simple heuristics may work well but can lose optimization potential. In this thesis, we present optimal and ILP-based heuristic programming models for calculating data fragment allocations for partially replicated database clusters. Using ILP, we are flexible to extend our models to (i) consider data modifications and reallocations and (ii) increase the robustness of allocations to compensate for node failures and workload uncertainty. We evaluate our approaches for TPC-H, TPC-DS, and a real-world accounting workload and compare the results to state-of-the-art allocation approaches. Our evaluations show significant improvements for varied allocation's properties: Compared to existing approaches, we can, for example, (i) almost halve the amount of allocated data, (ii) improve the throughput in case of node failures and workload uncertainty while using even less memory, (iii) halve the costs of data modifications, and (iv) reallocate less than 90\% of data when adding a node to the cluster. Importantly, we can calculate the corresponding ILP-based heuristic solutions within a few seconds. Finally, we demonstrate that the ideas of our ILP-based heuristics are also applicable to the index selection problem.},
  language = {en}
}

@phdthesis{Richly2024,
  author = {Richly, Keven},
  title = {Memory-efficient data management for spatio-temporal applications},
  doi = {10.25932/publishup-63547},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-635473},
  school = {Universit{\"a}t Potsdam},
  pages = {xii, 181},
  year = {2024},
  abstract = {The wide distribution of location-acquisition technologies means that large volumes of spatio-temporal data are continuously being accumulated.
Positioning systems such as GPS enable the tracking of various moving objects' trajectories, which are usually represented by a chronologically ordered sequence of observed locations. The analysis of movement patterns based on detailed positional information creates opportunities for applications that can improve business decisions and processes in a broad spectrum of industries (e.g., transportation, traffic control, or medicine). Due to the large data volumes generated in these applications, the cost-efficient storage of spatio-temporal data is desirable, especially when in-memory database systems are used to achieve interactive performance requirements. To efficiently utilize the available DRAM capacities, modern database systems support various tuning possibilities to reduce the memory footprint (e.g., data compression) or increase performance (e.g., additional indexes structures). By considering horizontal data partitioning, we can independently apply different tuning options on a fine-grained level. However, the selection of cost and performance-balancing configurations is challenging, due to the vast number of possible setups consisting of mutually dependent individual decisions. In this thesis, we introduce multiple approaches to improve spatio-temporal data management by automatically optimizing diverse tuning options for the application-specific access patterns and data characteristics. Our contributions are as follows: (1) We introduce a novel approach to determine fine-grained table configurations for spatio-temporal workloads. Our linear programming (LP) approach jointly optimizes the (i) data compression, (ii) ordering, (iii) indexing, and (iv) tiering. We propose different models which address cost dependencies at different levels of accuracy to compute optimized tuning configurations for a given workload, memory budgets, and data characteristics.
To yield maintainable and robust configurations, we further extend our LP-based approach to incorporate reconfiguration costs as well as optimizations for multiple potential workload scenarios. (2) To optimize the storage layout of timestamps in columnar databases, we present a heuristic approach for the workload-driven combined selection of a data layout and compression scheme. By considering attribute decomposition strategies, we are able to apply application-specific optimizations that reduce the memory footprint and improve performance. (3) We introduce an approach that leverages past trajectory data to improve the dispatch processes of transportation network companies. Based on location probabilities, we developed risk-averse dispatch strategies that reduce critical delays. (4) Finally, we used the use case of a transportation network company to evaluate our database optimizations on a real-world dataset. We demonstrate that workload-driven fine-grained optimizations allow us to reduce the memory footprint (up to 71\% by equal performance) or increase the performance (up to 90\% by equal memory size) compared to established rule-based heuristics. Individually, our contributions provide novel approaches to the current challenges in spatio-temporal data mining and database research.
Combining them allows in-memory databases to store and process spatio-temporal data more cost-efficiently.},
  language = {en}
}

@article{XinYingTiberiusAlnooretal.2024,
  author = {XinYing, Chew and Tiberius, Victor and Alnoor, Alhamzah and Camilleri, Mark and Khaw, Khai Wah},
  title = {The dark side of metaverse: a multi-perspective of deviant behaviors from {PLS-SEM} and {fsQCA} findings},
  series = {International journal of human-computer interaction},
  journal = {International journal of human-computer interaction},
  publisher = {Taylor \& Francis},
  address = {London},
  issn = {1044-7318},
  doi = {10.1080/10447318.2024.2331875},
  pages = {21},
  year = {2024},
  abstract = {The metaverse has created a huge buzz of interest because such a phenomenon is emerging. The behavioral aspect of the metaverse includes user engagement and deviant behaviors in the metaverse. Such technology has brought various dangers to individuals and society. There are growing cases reported of sexual abuse, racism, harassment, hate speech, and bullying because of online disinhibition make us feel more relaxed. This study responded to the literature call by investigating the effect of technical and social features through mediating roles of security and privacy on deviant behaviors in the metaverse. The data collected from virtual network users reached 1121 respondents. Partial Least Squares based structural equation modeling (PLS-SEM) and fuzzy set Qualitative Comparative Analysis (fsQCA) were used. PLS-SEM results revealed that social features such as user-to-user interaction, homophily, social ties, and social identity, and technical design such as immersive experience and invisibility significantly affect users' deviant behavior in the metaverse. The fsQCA results provided insights into the multiple causal solutions and configurations.
This study is exceptional because it provided decisive results by understanding the deviant behavior of users based on the symmetrical and asymmetrical approach to virtual networks.},
  language = {en}
}

% NOTE(review): the second author below is recorded as `Michael, Galbas`; the
% surname and given name may be swapped -- confirm against the publication
% before changing it (the citation key is built from the current form).
% The university, previously given as a second `publisher` field, is kept
% under `institution` so that `publisher` is defined only once.
@book{MeinelMichaelDengeletal.2024,
  author = {Meinel, Christoph and Michael, Galbas and Dengel, Andreas and Wendlandt, Matthias},
  title = {Konzeption eines integrativen {Schulfaches} ,,{Digitale Welt}`` f{\"u}r hessische {Schulen}},
  number = {160},
  publisher = {Universit{\"a}tsverlag Potsdam},
  address = {Potsdam},
  isbn = {978-3-86956-582-8},
  issn = {1613-5652},
  doi = {10.25932/publishup-63911},
  url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-639113},
  institution = {Universit{\"a}t Potsdam},
  pages = {34},
  year = {2024},
  abstract = {Um in der Schule bereits fr{\"u}hzeitig ein Verst{\"a}ndnis f{\"u}r informatische Prozesse zu vermitteln wurde das neue Informatikfach Digitale Welt f{\"u}r die Klassenstufe 5 konzipiert mit der bundesweit einmaligen Verbindung von Informatik mit anwendungsbezogenen und gesellschaftlich relevanten Bez{\"u}gen zur {\"O}kologie und {\"O}konomie. Der Technische Report gibt eine Handreichung zur Einf{\"u}hrung des neuen Faches.},
  language = {de}
}

@inproceedings{GonnermannMuellerTeichmann2024,
  author = {Gonnermann-M{\"u}ller, Jana and Teichmann, Malte},
  title = {Examining the learner's cognitive load in response to different learning material in high and low immersive virtual learning environments},
  series = {Information systems and neuroscience},
  volume = {68},
  booktitle = {Information systems and neuroscience},
  editor = {Davis, Fred D. and Riedl, Ren{\'e} and vom Brocke, Jan and L{\'e}ger, Pierre-Majorique and Randolph, Adriane B. and M{\"u}ller-Putz, Gernot R.},
  publisher = {Springer},
  address = {Cham},
  isbn = {978-3-031-58395-7},
  doi = {10.1007/978-3-031-58396-4_29},
  pages = {333--344},
  year = {2024},
  abstract = {Learning in virtual, immersive environments must be well-designed to foster learning instead of overwhelming and distracting the learner. So far, learning instructions based on cognitive load theory recommend keeping the learning instructions clean and simple to reduce the extraneous cognitive load of the learner to foster learning performance. The advantages of immersive learning, such as multiple options for realistic simulation, movement and feedback, raise questions about the tension between an increase of excitement and flow with highly realistic environments on the one hand and a reduction of cognitive load by developing clean and simple surroundings on the other hand. This study aims to gain insights into learners' cognitive responses during the learning process by continuously assessing cognitive load through eye-tracking. The experiment compares two distinct immersive learning environments and varying methods of content presentation.},
  language = {en}
}

@article{BaumBaumannBatzel2024,
  author = {Baum, Katharina and Baumann, Annika and Batzel, Katharina},
  title = {Investigating innovation diffusion in gender-specific medicine},
  series = {Business \& information systems engineering},
  volume = {66},
  journal = {Business \& information systems engineering},
  number = {3},
  publisher = {Springer Fachmedien},
  address = {Wiesbaden},
  issn = {2363-7005},
  doi = {10.1007/s12599-024-00875-6},
  pages = {335--355},
  year = {2024},
  abstract = {The field of healthcare is characterized by constant innovation, with gender-specific medicine emerging as a new subfield that addresses sex and gender disparities in clinical manifestations, outcomes, treatment, and prevention of disease.
Despite its importance, the adoption of gender-specific medicine remains understudied, posing potential risks to patient outcomes due to a lack of awareness of the topic. Building on the Innovation Decision Process Theory, this study examines the spread of information about gender-specific medicine in online networks. The study applies social network analysis to a Twitter dataset reflecting online discussions about the topic to gain insights into its adoption by health professionals and patients online. Results show that the network has a community structure with limited information exchange between sub-communities and that mainly medical experts dominate the discussion. The findings suggest that the adoption of gender-specific medicine might be in its early stages, focused on knowledge exchange. Understanding the diffusion of gender-specific medicine among medical professionals and patients may facilitate its adoption and ultimately improve health outcomes.},
  language = {en}
}

@article{AbramovaGladkaya2024,
  author = {Abramova, Olga and Gladkaya, Margarita},
  title = {Behind videoconferencing fatigue at work},
  series = {Business \& information systems engineering},
  journal = {Business \& information systems engineering},
  publisher = {Springer Fachmedien},
  address = {Wiesbaden},
  issn = {2363-7005},
  doi = {10.1007/s12599-024-00874-7},
  pages = {19},
  year = {2024},
  abstract = {A remarkable peculiarity of videoconferencing (VC) applications - the self-view - a.k.a. digital mirror, is examined as a potential reason behind the voiced exhaustion among users. This work draws on technostress research and objective self-awareness theory and proposes the communication role (sender vs. receiver) as an interaction variable. We report the results of two studies among European employees (n1 = 176, n2 = 253) with a one-year time lag.
A higher frequency of self-view in a VC when receiving a message, i.e., listening to others, indirectly increases negative affect (study 1 \& 2) and exhaustion (study 2) via the increased state of public self-awareness. Self-viewing in the role of message sender, e.g., as an online presenter, also increases public self-awareness, but its overall effects are less harmful. As for individual differences, users predisposed to public self-consciousness were more concerned with how other VC participants perceived them. Gender effects were insignificant.},
  language = {en}
}

@inproceedings{Grum2024,
  author = {Grum, Marcus},
  title = {Researching multi-site artificial neural networks' activation rates and activation cycles},
  series = {Business modeling and software design : 14th International Symposium, BMSD 2024, Luxembourg City, Luxembourg, July 1-3, 2024, proceedings},
  booktitle = {Business modeling and software design : 14th International Symposium, BMSD 2024, Luxembourg City, Luxembourg, July 1-3, 2024, proceedings},
  editor = {Shishkov, Boris},
  publisher = {Springer},
  address = {Cham},
  isbn = {978-3-031-64072-8},
  doi = {10.1007/978-3-031-64073-5_12},
  pages = {186--206},
  year = {2024},
  abstract = {With the further development of more and more production machines into cyber-physical systems, and their greater integration with artificial intelligence (AI) techniques, the coordination of intelligent systems is a highly relevant target factor for the operation and improvement of networked processes, such as they can be found in cross-organizational production contexts spanning multiple distributed locations. This work aims to extend prior research on managing their artificial knowledge transfers as coordination instrument by examining effects of different activation types (respective activation rates and cycles) on by Artificial Neural Network (ANN)-instructed production machines.
For this, it provides a new integration type of ANN-based cyber-physical production system as a tool to research artificial knowledge transfers: In a design-science-oriented way, a prototype of a simulation system is constructed as Open Source information system which will be used in on-building research to (I) enable research on ANN activation types in production networks, (II) illustrate ANN-based production networks disrupted by activation types and clarify the need for harmonizing them, and (III) demonstrate conceptual management interventions. This simulator shall establish the importance of site-specific coordination mechanisms and novel forms of management interventions as drivers of efficient artificial knowledge transfer.},
  language = {en}
}