@article{ThorbrueggeDeselSchaefer2023, author = {Thorbr{\"u}gge, Carsten and Desel, J{\"o}rg and Sch{\"a}fer, Len Ole}, title = {Vorqualifikationen und Anerkennungsoptionen im Informatikstudium}, series = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, journal = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, number = {13}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-548-4}, issn = {1868-0844}, doi = {10.25932/publishup-61394}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-613942}, pages = {73 -- 89}, year = {2023}, abstract = {Viele Informatikstudierende sammeln bereits vor ihrem Studium berufliche Erfahrungen im Informatikbereich, ohne dass diese inhaltlich und didaktisch im Studium ber{\"u}cksichtigt werden. Dieser Beitrag geht der Frage nach, welche Kompetenzen aus beruflichen Vorqualifikationen bei Informatikstudierenden existieren und wie diese in Bezug zu Anerkennungsoptionen gesetzt werden k{\"o}nnen. Betrachtet werden: die pauschale Anerkennung, die auf erworbenen Zertifikaten beruht; die individuelle Anerkennung, bei der individuell erworbene Kompetenzen nachgewiesen werden; die Adaption von individuellen Lernwegen, die Teilkompetenzen der Studierenden ber{\"u}cksichtigt. Es wird eine Interviewstudie vorgestellt, in der Kompetenzen f{\"u}r ein Sample von Informatikstudierenden mit Vorqualifikation als Fachinformatiker/in erhoben und eine Zuordnung zu den Anerkennungsoptionen vorgenommen wurde. F{\"u}r die pr{\"a}zisere Gestaltung von Anerkennungsprozessen und zur kritischen Reflexion der eingesetzten hochschuldidaktischen Konzepte wurde eine empirische Basis geschaffen. Die vorhandenen Konzepte richten sich traditionell an Abiturienten/ innen mit sehr geringem Informatikhintergrund und ber{\"u}cksichtigen die tats{\"a}chlich existierende Heterogenit{\"a}t der Studienanf{\"a}nger/innen nicht angemessen. Die Ergebnisse zeigen, dass die Befragten aus ihrer Vorqualifikation relevante fachliche Kompetenzen mitbringen, die mit den Anerkennungsoptionen korrespondieren und deren Weiterentwicklung dienen k{\"o}nnen. Dar{\"u}ber hinaus werden aus {\"u}berfachlichen Kompetenzen wie Selbststeuerungskompetenzen weitere Erkenntnisse zur Studiengestaltung gewonnen.}, language = {de} } @article{Weicker2023, author = {Weicker, Karsten}, title = {Peer-Review als Katalysator im Lernprozess}, series = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, journal = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, number = {13}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-548-4}, issn = {1868-0844}, doi = {10.25932/publishup-61602}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-616025}, pages = {257 -- 277}, year = {2023}, abstract = {Peer-Reviews werden seit geraumer Zeit in unterschiedlichen Lehrszenarien eingesetzt. In diesem Paper wird untersucht, inwieweit das Peer- Review die Auseinandersetzung mit den Inhalten eines Grundlagenmoduls in einem pr{\"a}senzfreien Lehrszenario bef{\"o}rdern kann. Dabei scheint in den Ergebnissen die Qualit{\"a}t der selbst erstellten Reviews einer der wichtigsten Einflussfaktoren f{\"u}r den Lernerfolg zu sein, w{\"a}hrend Experten-Feedback und weitere Faktoren deutlich untergeordnet erscheinen. Die F{\"a}higkeit ausf{\"u}hrliche Peer-Reviews zu verfassen geht einher mit dem Erwerb von fachlicher Kompetenz bzw. entsprechenden fachlichen Vorkenntnissen.}, language = {de} } @article{OpelNetzerDesel2023, author = {Opel, Simone and Netzer, Cajus Marian and Desel, J{\"o}rg}, title = {Adaption von Lernwegen in adaptierten Lehrmaterialien f{\"u}r Studierende mit Berufsausbildungsabschluss}, series = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, journal = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, number = {13}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-548-4}, issn = {1868-0844}, doi = {10.25932/publishup-61418}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-614188}, pages = {91 -- 114}, year = {2023}, abstract = {Obwohl immer mehr Menschen nicht direkt ein Studium aufnehmen, sondern zuvor eine berufliche Ausbildung absolvieren, werden die in der Ausbildung erworbenen Kompetenzen von den Hochschulen inhaltlich und didaktisch meist ignoriert. Ein Ansatz, diese Kompetenzen zu w{\"u}rdigen, ist die formale Anrechnung von mitgebrachten Kompetenzen als (f{\"u}r den Studienabschluss erforderliche) Leistungspunkte. Eine andere Variante ist der Einsatz von speziell f{\"u}r die Zielgruppe der Studierenden mit Vorkenntnissen adaptiertem Lehr-Lernmaterial. Um dar{\"u}ber hinaus individuelle Unterschiede zu ber{\"u}cksichtigen, erlaubt eine weitere Adaption individueller Lernpfade den Lernenden, genau die jeweils fehlenden Kompetenzen zu erwerben. In diesem Beitrag stellen wir die exemplarische Entwicklung derartigen Materials anhand des Kurses „Datenbanken" f{\"u}r die Zielgruppe der Studierenden mit einer abgeschlossenen Ausbildung zum Fachinformatiker bzw. zur Fachinformatikerin vor.}, language = {de} } @article{Weber2023, author = {Weber, Gerhard}, title = {Informatik und Barrierefreiheit}, series = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, journal = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, number = {13}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-548-4}, issn = {1868-0844}, doi = {10.25932/publishup-61387}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-613874}, pages = {35 -- 50}, year = {2023}, abstract = {Barrierefreiheit kann durch Methoden der Informatik hergestellt und ausgebaut werden. Dieser eingeladene Beitrag stellt die Anforderungen von Menschen mit den umfangreichsten Benutzererfordernissen an Software vor, die z. B. eigene Schriftsysteme wie Braille und entsprechende taktile Ausgabeger{\"a}te verwenden. Assistive Technologien umfassen dabei auch Software verschiedenster Art. Es werden die wichtigsten Kompetenzen daf{\"u}r vorgestellt. Im Curriculum der Informatik k{\"o}nnen diese Kompetenzen im Rahmen von speziellen Vorlesungen und {\"U}bungen vermittelt werden oder sie werden in die jeweiligen Fachgebiete integriert. Um den Studienbetrieb ebenfalls barrierefrei zu gestalten, sind weitere Anstrengungen notwendig, die Lehrende, Verwaltung und die Hochschulleitung einbeziehen.}, language = {de} } @article{SchellSchwill2023, author = {Schell, Timon and Schwill, Andreas}, title = {„Es ist kompliziert, alles inklusive Privatleben unter einen Hut zu bekommen"}, series = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, journal = {Hochschuldidaktik Informatik HDI 2021 (Commentarii informaticae didacticae)}, number = {13}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-548-4}, issn = {1868-0844}, doi = {10.25932/publishup-61388}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-613882}, pages = {53 -- 71}, year = {2023}, abstract = {Eine {\"u}bliche Erz{\"a}hlung verkn{\"u}pft lange Studienzeiten und hohe Abbrecherquoten im Informatikstudium zum einen mit der sehr gut bezahlten Nebent{\"a}tigkeit von Studierenden in der Informatikbranche, die deutlich studienzeitverl{\"a}ngernd sei; zum anderen werde wegen des hohen Bedarfs an Informatikern ein formeller Studienabschluss von den Studierenden h{\"a}ufig als entbehrlich betrachtet und eine Karriere in der Informatikbranche ohne abgeschlossenes Studium begonnen. In dieser Studie, durchgef{\"u}hrt an der Universit{\"a}t Potsdam, untersuchen wir, wie viele Informatikstudierende neben dem Studium innerhalb und außerhalb der Informatikbranche arbeiten, welche Erwartungen sie neben der Bezahlung damit verbinden und wie sich die T{\"a}tigkeit auf ihr Studium und ihre sp{\"a}tere berufliche Perspektive auswirkt. Aus aktuellem Anlass interessieren uns auch die Auswirkungen der Covid-19-Pandemie auf die Arbeitst{\"a}tigkeiten der Informatikstudierenden.}, language = {de} } @phdthesis{SchulzHanke2023, author = {Schulz-Hanke, Christian}, title = {BCH Codes mit kombinierter Korrektur und Erkennung}, doi = {10.25932/publishup-61794}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-617943}, school = {Universit{\"a}t Potsdam}, pages = {ii, 191}, year = {2023}, abstract = {BCH Codes mit kombinierter Korrektur und Erkennung In dieser Arbeit wird auf Grundlage des BCH Codes untersucht, wie eine Fehlerkorrektur mit einer Erkennung h{\"o}herer Fehleranzahlen kombiniert werden kann. Mit dem Verfahren der 1-Bit Korrektur mit zus{\"a}tzlicher Erkennung h{\"o}herer Fehler wurde ein Ansatz entwickelt, welcher die Erkennung zus{\"a}tzlicher Fehler durch das parallele L{\"o}sen einfacher Gleichungen der Form s_x = s_1^x durchf{\"u}hrt. Die Anzahl dieser Gleichungen ist linear zu der Anzahl der zu {\"u}berpr{\"u}fenden h{\"o}heren Fehler. In dieser Arbeit wurde zus{\"a}tzlich f{\"u}r bis zu 4-Bit Korrekturen mit zus{\"a}tzlicher Erkennung h{\"o}herer Fehler ein weiterer allgemeiner Ansatz vorgestellt. Dabei werden parallel f{\"u}r alle korrigierbaren Fehleranzahlen spekulative Fehlerkorrekturen durchgef{\"u}hrt. Aus den bestimmten Fehlerstellen werden spekulative Syndromkomponenten erzeugt, durch welche die Fehlerstellen best{\"a}tigt und h{\"o}here erkennbare Fehleranzahlen ausgeschlossen werden k{\"o}nnen. Die vorgestellten Ans{\"a}tze unterscheiden sich von dem in entwickelten Ansatz, bei welchem die Anzahl der Fehlerstellen durch die Berechnung von Determinanten in absteigender Reihenfolge berechnet wird, bis die erste Determinante 0 bildet. Bei dem bekannten Verfahren ist durch die Berechnung der Determinanten eine faktorielle Anzahl an Berechnungen in Relation zu der Anzahl zu {\"u}berpr{\"u}fender Fehler durchzuf{\"u}hren. Im Vergleich zu dem bekannten sequentiellen Verfahrens nach Berlekamp Massey besitzen die Berechnungen im vorgestellten Ansatz simple Gleichungen und k{\"o}nnen parallel durchgef{\"u}hrt werden.Bei dem bekannten Verfahren zur parallelen Korrektur von 4-Bit Fehlern ist eine Gleichung vierten Grades im GF(2^m) zu l{\"o}sen. Dies erfolgt, indem eine Hilfsgleichung dritten Grades und vier Gleichungen zweiten Grades parallel gel{\"o}st werden. In der vorliegenden Arbeit wurde gezeigt, dass sich eine Gleichung zweiten Grades einsparen l{\"a}sst, wodurch sich eine Vereinfachung der Hardware bei einer parallelen Realisierung der 4-Bit Korrektur ergibt. Die erzielten Ergebnisse wurden durch umfangreiche Simulationen in Software und Hardwareimplementierungen {\"u}berpr{\"u}ft.}, language = {de} } @inproceedings{VladovaUllrichSultanowetal.2023, author = {Vladova, Gergana and Ullrich, Andr{\´e} and Sultanow, Eldar and Tobolla, Marinho and Sebrak, Sebastian and Czarnecki, Christian and Brockmann, Carsten}, title = {Visual analytics for knowledge management}, series = {Informatik 2023}, booktitle = {Informatik 2023}, editor = {Klein, Maike and Krupka, Daniel and Winter, Cornelia and Wohlgemuth, Volker}, publisher = {Gesellschaft f{\"u}r Informatik e.V. (GI)}, address = {Bonn}, isbn = {978-3-88579-731-9}, issn = {1617-5468}, doi = {10.18420/inf2023_187}, pages = {1851 -- 1870}, year = {2023}, abstract = {The management of knowledge in organizations considers both established long-term processes and cooperation in agile project teams. Since knowledge can be both tacit and explicit, its transfer from the individual to the organizational knowledge base poses a challenge in organizations. This challenge increases when the fluctuation of knowledge carriers is exceptionally high. Especially in large projects in which external consultants are involved, there is a risk that critical, company-relevant knowledge generated in the project will leave the company with the external knowledge carrier and thus be lost. In this paper, we show the advantages of an early warning system for knowledge management to avoid this loss. In particular, the potential of visual analytics in the context of knowledge management systems is presented and discussed. We present a project for the development of a business-critical software system and discuss the first implementations and results.}, language = {en} } @article{HagemannAbramova2023, author = {Hagemann, Linus and Abramova, Olga}, title = {Emotions and information diffusion on social media}, series = {AIS transactions on replication research}, volume = {9}, journal = {AIS transactions on replication research}, number = {1}, publisher = {AIS}, address = {Atlanta}, issn = {2473-3458}, doi = {10.17705/1atrr.00079}, pages = {1 -- 19}, year = {2023}, abstract = {This paper presents a methodological and conceptual replication of Stieglitz and Dang-Xuan's (2013) investigation of the role of sentiment in information-sharing behavior on social media. Whereas Stieglitz and Dang-Xuan (2013) focused on Twitter communication prior to the state parliament elections in the German states Baden-Wurttemberg, Rheinland-Pfalz, and Berlin in 2011, we test their theoretical propositions in the context of the state parliament elections in Saxony-Anhalt (Germany) 2021. We confirm the positive link between sentiment in a political Twitter message and its number of retweets in a methodological replication. In a conceptual replication, where sentiment was assessed with the alternative dictionary-based tool LIWC, the sentiment was negatively associated with the retweet volume. In line with the original study, the strength of association between sentiment and retweet time lag insignificantly differs between tweets with negative sentiment and tweets with positive sentiment. We also found that the number of an author's followers was an essential determinant of sharing behavior. However, two hypotheses supported in the original study did not hold for our sample. Precisely, the total amount of sentiments was insignificantly linked to the time lag to the first retweet. Finally, in our data, we do not observe that the association between the overall sentiment and retweet quantity is stronger for tweets with negative sentiment than for those with positive sentiment.}, language = {en} } @article{PuriVardeMelo2023, author = {Puri, Manish and Varde, Aparna S. and Melo, Gerard de}, title = {Commonsense based text mining on urban policy}, series = {Language resources and evaluation}, volume = {57}, journal = {Language resources and evaluation}, publisher = {Springer}, address = {Dordrecht [u.a.]}, issn = {1574-020X}, doi = {10.1007/s10579-022-09584-6}, pages = {733 -- 763}, year = {2023}, abstract = {Local laws on urban policy, i.e., ordinances directly affect our daily life in various ways (health, business etc.), yet in practice, for many citizens they remain impervious and complex. This article focuses on an approach to make urban policy more accessible and comprehensible to the general public and to government officials, while also addressing pertinent social media postings. Due to the intricacies of the natural language, ranging from complex legalese in ordinances to informal lingo in tweets, it is practical to harness human judgment here. To this end, we mine ordinances and tweets via reasoning based on commonsense knowledge so as to better account for pragmatics and semantics in the text. Ours is pioneering work in ordinance mining, and thus there is no prior labeled training data available for learning. This gap is filled by commonsense knowledge, a prudent choice in situations involving a lack of adequate training data. The ordinance mining can be beneficial to the public in fathoming policies and to officials in assessing policy effectiveness based on public reactions. This work contributes to smart governance, leveraging transparency in governing processes via public involvement. We focus significantly on ordinances contributing to smart cities, hence an important goal is to assess how well an urban region heads towards a smart city as per its policies mapping with smart city characteristics, and the corresponding public satisfaction.}, language = {en} } @article{GarrelsKhodabakhshRenardetal.2023, author = {Garrels, Tim and Khodabakhsh, Athar and Renard, Bernhard Y. and Baum, Katharina}, title = {LazyFox: fast and parallelized overlapping community detection in large graphs}, series = {PEERJ Computer Science}, volume = {9}, journal = {PEERJ Computer Science}, publisher = {PeerJ Inc.}, address = {London}, issn = {2376-5992}, doi = {10.7717/peerj-cs.1291}, pages = {30}, year = {2023}, abstract = {The detection of communities in graph datasets provides insight about a graph's underlying structure and is an important tool for various domains such as social sciences, marketing, traffic forecast, and drug discovery. While most existing algorithms provide fast approaches for community detection, their results usually contain strictly separated communities. However, most datasets would semantically allow for or even require overlapping communities that can only be determined at much higher computational cost. We build on an efficient algorithm, FOX, that detects such overlapping communities. FOX measures the closeness of a node to a community by approximating the count of triangles which that node forms with that community. We propose LAZYFOX, a multi-threaded adaptation of the FOX algorithm, which provides even faster detection without an impact on community quality. This allows for the analyses of significantly larger and more complex datasets. LAZYFOX enables overlapping community detection on complex graph datasets with millions of nodes and billions of edges in days instead of weeks. As part of this work, LAZYFOX's implementation was published and is available as a tool under an MIT licence at https://github.com/TimGarrels/LazyFox.}, language = {en} } @book{KubanRottaNolteetal.2023, author = {Kuban, Robert and Rotta, Randolf and Nolte, J{\"o}rg and Chromik, Jonas and Beilharz, Jossekin Jakob and Pirl, Lukas and Friedrich, Tobias and Lenzner, Pascal and Weyand, Christopher and Juiz, Carlos and Bermejo, Belen and Sauer, Joao and Coelh, Leandro dos Santos and Najafi, Pejman and P{\"u}nter, Wenzel and Cheng, Feng and Meinel, Christoph and Sidorova, Julia and Lundberg, Lars and Vogel, Thomas and Tran, Chinh and Moser, Irene and Grunske, Lars and Elsaid, Mohamed Esameldin Mohamed and Abbas, Hazem M. and Rula, Anisa and Sejdiu, Gezim and Maurino, Andrea and Schmidt, Christopher and H{\"u}gle, Johannes and Uflacker, Matthias and Nozza, Debora and Messina, Enza and Hoorn, Andr{\´e} van and Frank, Markus and Schulz, Henning and Alhosseini Almodarresi Yasin, Seyed Ali and Nowicki, Marek and Muite, Benson K. and Boysan, Mehmet Can and Bianchi, Federico and Cremaschi, Marco and Moussa, Rim and Abdel-Karim, Benjamin M. and Pfeuffer, Nicolas and Hinz, Oliver and Plauth, Max and Polze, Andreas and Huo, Da and Melo, Gerard de and Mendes Soares, F{\´a}bio and Oliveira, Roberto C{\´e}lio Lim{\~a}o de and Benson, Lawrence and Paul, Fabian and Werling, Christian and Windheuser, Fabian and Stojanovic, Dragan and Djordjevic, Igor and Stojanovic, Natalija and Stojnev Ilic, Aleksandra and Weidmann, Vera and Lowitzki, Leon and Wagner, Markus and Ifa, Abdessatar Ben and Arlos, Patrik and Megia, Ana and Vendrell, Joan and Pfitzner, Bjarne and Redondo, Alberto and R{\´i}os Insua, David and Albert, Justin Amadeus and Zhou, Lin and Arnrich, Bert and Szab{\´o}, Ildik{\´o} and Fodor, Szabina and Ternai, Katalin and Bhowmik, Rajarshi and Campero Durand, Gabriel and Shevchenko, Pavlo and Malysheva, Milena and Prymak, Ivan and Saake, Gunter}, title = {HPI Future SOC Lab - Proceedings 2019}, number = {158}, editor = {Meinel, Christoph and Polze, Andreas and Beins, Karsten and Strotmann, Rolf and Seibold, Ulrich and R{\"o}dszus, Kurt and M{\"u}ller, J{\"u}rgen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-564-4}, issn = {1613-5652}, doi = {10.25932/publishup-59791}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-597915}, publisher = {Universit{\"a}t Potsdam}, pages = {xi, 301}, year = {2023}, abstract = {The "HPI Future SOC Lab" is a cooperation of the Hasso Plattner Institute (HPI) and industry partners. Its mission is to enable and promote exchange and interaction between the research community and the industry partners. The HPI Future SOC Lab provides researchers with free of charge access to a complete infrastructure of state of the art hard and software. This infrastructure includes components, which might be too expensive for an ordinary research environment, such as servers with up to 64 cores and 2 TB main memory. The offerings address researchers particularly from but not limited to the areas of computer science and business information systems. Main areas of research include cloud computing, parallelization, and In-Memory technologies. This technical report presents results of research projects executed in 2019. Selected projects have presented their results on April 9th and November 12th 2019 at the Future SOC Lab Day events.}, language = {en} } @misc{KonigorskiWernickeSlosareketal.2023, author = {Konigorski, Stefan and Wernicke, Sarah and Slosarek, Tamara and Zenner, Alexander Maximilian and Strelow, Nils and Ruether, Darius Ferenc and Henschel, Florian and Manaswini, Manisha and Pottb{\"a}cker, Fabian and Edelman, Jonathan Antonio and Owoyele, Babajide and Danieletto, Matteo and Golden, Eddye and Zweig, Micol and Nadkarni, Girish N. and B{\"o}ttinger, Erwin}, title = {StudyU: A Platform for Designing and Conducting Innovative Digital N-of-1 Trials}, series = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, journal = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t}, number = {12}, doi = {10.25932/publishup-58037}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-580370}, pages = {12}, year = {2023}, abstract = {N-of-1 trials are the gold standard study design to evaluate individual treatment effects and derive personalized treatment strategies. Digital tools have the potential to initiate a new era of N-of-1 trials in terms of scale and scope, but fully functional platforms are not yet available. Here, we present the open source StudyU platform, which includes the StudyU Designer and StudyU app. With the StudyU Designer, scientists are given a collaborative web application to digitally specify, publish, and conduct N-of-1 trials. The StudyU app is a smartphone app with innovative user-centric elements for participants to partake in trials published through the StudyU Designer to assess the effects of different interventions on their health. Thereby, the StudyU platform allows clinicians and researchers worldwide to easily design and conduct digital N-of-1 trials in a safe manner. We envision that StudyU can change the landscape of personalized treatments both for patients and healthy individuals, democratize and personalize evidence generation for self-optimization and medicine, and can be integrated in clinical practice.}, language = {en} } @article{VitaglianoHameedJiangetal.2023, author = {Vitagliano, Gerardo and Hameed, Mazhar and Jiang, Lan and Reisener, Lucas and Wu, Eugene and Naumann, Felix}, title = {Pollock: a data loading benchmark}, series = {Proceedings of the VLDB Endowment}, volume = {16}, journal = {Proceedings of the VLDB Endowment}, number = {8}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3594512.3594518}, pages = {1870 -- 1882}, year = {2023}, abstract = {Any system at play in a data-driven project has a fundamental requirement: the ability to load data. The de-facto standard format to distribute and consume raw data is CSV. Yet, the plain text and flexible nature of this format make such files often difficult to parse and correctly load their content, requiring cumbersome data preparation steps. We propose a benchmark to assess the robustness of systems in loading data from non-standard CSV formats and with structural inconsistencies. First, we formalize a model to describe the issues that affect real-world files and use it to derive a systematic lpollutionz process to generate dialects for any given grammar. Our benchmark leverages the pollution framework for the csv format. To guide pollution, we have surveyed thousands of real-world, publicly available csv files, recording the problems we encountered. We demonstrate the applicability of our benchmark by testing and scoring 16 different systems: popular csv parsing frameworks, relational database tools, spreadsheet systems, and a data visualization tool.}, language = {en} } @phdthesis{Vitagliano2024, author = {Vitagliano, Gerardo}, title = {Modeling the structure of tabular files for data preparation}, doi = {10.25932/publishup-62435}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624351}, school = {Universit{\"a}t Potsdam}, pages = {ii, 114}, year = {2024}, abstract = {To manage tabular data files and leverage their content in a given downstream task, practitioners often design and execute complex transformation pipelines to prepare them. The complexity of such pipelines stems from different factors, including the nature of the preparation tasks, often exploratory or ad-hoc to specific datasets; the large repertory of tools, algorithms, and frameworks that practitioners need to master; and the volume, variety, and velocity of the files to be prepared. Metadata plays a fundamental role in reducing this complexity: characterizing a file assists end users in the design of data preprocessing pipelines, and furthermore paves the way for suggestion, automation, and optimization of data preparation tasks. Previous research in the areas of data profiling, data integration, and data cleaning, has focused on extracting and characterizing metadata regarding the content of tabular data files, i.e., about the records and attributes of tables. Content metadata are useful for the latter stages of a preprocessing pipeline, e.g., error correction, duplicate detection, or value normalization, but they require a properly formed tabular input. Therefore, these metadata are not relevant for the early stages of a preparation pipeline, i.e., to correctly parse tables out of files. In this dissertation, we turn our focus to what we call the structure of a tabular data file, i.e., the set of characters within a file that do not represent data values but are required to parse and understand the content of the file. We provide three different approaches to represent file structure, an explicit representation based on context-free grammars; an implicit representation based on file-wise similarity; and a learned representation based on machine learning. In our first contribution, we use the grammar-based representation to characterize a set of over 3000 real-world csv files and identify multiple structural issues that let files deviate from the csv standard, e.g., by having inconsistent delimiters or containing multiple tables. We leverage our learnings about real-world files and propose Pollock, a benchmark to test how well systems parse csv files that have a non-standard structure, without any previous preparation. We report on our experiments on using Pollock to evaluate the performance of 16 real-world data management systems. Following, we characterize the structure of files implicitly, by defining a measure of structural similarity for file pairs. We design a novel algorithm to compute this measure, which is based on a graph representation of the files' content. We leverage this algorithm and propose Mondrian, a graphical system to assist users in identifying layout templates in a dataset, classes of files that have the same structure, and therefore can be prepared by applying the same preparation pipeline. Finally, we introduce MaGRiTTE, a novel architecture that uses self-supervised learning to automatically learn structural representations of files in the form of vectorial embeddings at three different levels: cell level, row level, and file level. We experiment with the application of structural embeddings for several tasks, namely dialect detection, row classification, and data preparation efforts estimation. Our experimental results show that structural metadata, either identified explicitly on parsing grammars, derived implicitly as file-wise similarity, or learned with the help of machine learning architectures, is fundamental to automate several tasks, to scale up preparation to large quantities of files, and to provide repeatable preparation pipelines.}, language = {en} } @phdthesis{Huegle2024, author = {Huegle, Johannes}, title = {Causal discovery in practice: Non-parametric conditional independence testing and tooling for causal discovery}, doi = {10.25932/publishup-63582}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-635820}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 156}, year = {2024}, abstract = {Knowledge about causal structures is crucial for decision support in various domains. For example, in discrete manufacturing, identifying the root causes of failures and quality deviations that interrupt the highly automated production process requires causal structural knowledge. However, in practice, root cause analysis is usually built upon individual expert knowledge about associative relationships. But, "correlation does not imply causation", and misinterpreting associations often leads to incorrect conclusions. Recent developments in methods for causal discovery from observational data have opened the opportunity for a data-driven examination. Despite its potential for data-driven decision support, omnipresent challenges impede causal discovery in real-world scenarios. In this thesis, we make a threefold contribution to improving causal discovery in practice. (1) The growing interest in causal discovery has led to a broad spectrum of methods with specific assumptions on the data and various implementations. Hence, application in practice requires careful consideration of existing methods, which becomes laborious when dealing with various parameters, assumptions, and implementations in different programming languages. Additionally, evaluation is challenging due to the lack of ground truth in practice and limited benchmark data that reflect real-world data characteristics. To address these issues, we present a platform-independent modular pipeline for causal discovery and a ground truth framework for synthetic data generation that provides comprehensive evaluation opportunities, e.g., to examine the accuracy of causal discovery methods in case of inappropriate assumptions. (2) Applying constraint-based methods for causal discovery requires selecting a conditional independence (CI) test, which is particularly challenging in mixed discrete-continuous data omnipresent in many real-world scenarios. In this context, inappropriate assumptions on the data or the commonly applied discretization of continuous variables reduce the accuracy of CI decisions, leading to incorrect causal structures. Therefore, we contribute a non-parametric CI test leveraging k-nearest neighbors methods and prove its statistical validity and power in mixed discrete-continuous data, as well as the asymptotic consistency when used in constraint-based causal discovery. An extensive evaluation of synthetic and real-world data shows that the proposed CI test outperforms state-of-the-art approaches in the accuracy of CI testing and causal discovery, particularly in settings with low sample sizes. (3) To show the applicability and opportunities of causal discovery in practice, we examine our contributions in real-world discrete manufacturing use cases. For example, we showcase how causal structural knowledge helps to understand unforeseen production downtimes or adds decision support in case of failures and quality deviations in automotive body shop assembly lines.}, language = {en} } @phdthesis{Halfpap2024, author = {Halfpap, Stefan}, title = {Integer linear programming-based heuristics for partially replicated database clusters and selecting indexes}, doi = {10.25932/publishup-63361}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-633615}, school = {Universit{\"a}t Potsdam}, pages = {iii, 185}, year = {2024}, abstract = {Column-oriented database systems can efficiently process transactional and analytical queries on a single node. However, increasing or peak analytical loads can quickly saturate single-node database systems. Then, a common scale-out option is using a database cluster with a single primary node for transaction processing and read-only replicas. Using (the naive) full replication, queries are distributed among nodes independently of the accessed data. This approach is relatively expensive because all nodes must store all data and apply all data modifications caused by inserts, deletes, or updates. In contrast to full replication, partial replication is a more cost-efficient implementation: Instead of duplicating all data to all replica nodes, partial replicas store only a subset of the data while being able to process a large workload share. Besides lower storage costs, partial replicas enable (i) better scaling because replicas must potentially synchronize only subsets of the data modifications and thus have more capacity for read-only queries and (ii) better elasticity because replicas have to load less data and can be set up faster. However, splitting the overall workload evenly among the replica nodes while optimizing the data allocation is a challenging assignment problem. The calculation of optimized data allocations in a partially replicated database cluster can be modeled using integer linear programming (ILP). ILP is a common approach for solving assignment problems, also in the context of database systems. Because ILP is not scalable, existing approaches (also for calculating partial allocations) often fall back to simple (e.g., greedy) heuristics for larger problem instances. Simple heuristics may work well but can lose optimization potential. In this thesis, we present optimal and ILP-based heuristic programming models for calculating data fragment allocations for partially replicated database clusters. Using ILP, we are flexible to extend our models to (i) consider data modifications and reallocations and (ii) increase the robustness of allocations to compensate for node failures and workload uncertainty. We evaluate our approaches for TPC-H, TPC-DS, and a real-world accounting workload and compare the results to state-of-the-art allocation approaches. Our evaluations show significant improvements for varied allocation's properties: Compared to existing approaches, we can, for example, (i) almost halve the amount of allocated data, (ii) improve the throughput in case of node failures and workload uncertainty while using even less memory, (iii) halve the costs of data modifications, and (iv) reallocate less than 90\% of data when adding a node to the cluster. Importantly, we can calculate the corresponding ILP-based heuristic solutions within a few seconds. Finally, we demonstrate that the ideas of our ILP-based heuristics are also applicable to the index selection problem.}, language = {en} } @inproceedings{RojahnGronau2024, author = {Rojahn, Marcel and Gronau, Norbert}, title = {Openness indicators for the evaluation of digital platforms between the launch and maturity phase}, series = {Proceedings of the 57th Annual Hawaii International Conference on System Sciences}, booktitle = {Proceedings of the 57th Annual Hawaii International Conference on System Sciences}, editor = {Bui, Tung X.}, publisher = {Department of IT Management Shidler College of Business University of Hawaii}, address = {Honolulu, HI}, isbn = {978-0-99813-317-1}, pages = {4516 -- 4525}, year = {2024}, abstract = {In recent years, the evaluation of digital platforms has become an important focus in the field of information systems science. The identification of influential indicators that drive changes in digital platforms, specifically those related to openness, is still an unresolved issue. This paper addresses the challenge of identifying measurable indicators and characterizing the transition from launch to maturity in digital platforms. It proposes a systematic analytical approach to identify relevant openness indicators for evaluation purposes. The main contributions of this study are the following (1) the development of a comprehensive procedure for analyzing indicators, (2) the categorization of indicators as evaluation metrics within a multidimensional grid-box model, (3) the selection and evaluation of relevant indicators, (4) the identification and assessment of digital platform architectures during the launch-to-maturity transition, and (5) the evaluation of the applicability of the conceptualization and design process for digital platform evaluation.}, language = {en} } @phdthesis{Richly2024, author = {Richly, Keven}, title = {Memory-efficient data management for spatio-temporal applications}, doi = {10.25932/publishup-63547}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-635473}, school = {Universit{\"a}t Potsdam}, pages = {xii, 181}, year = {2024}, abstract = {The wide distribution of location-acquisition technologies means that large volumes of spatio-temporal data are continuously being accumulated. Positioning systems such as GPS enable the tracking of various moving objects' trajectories, which are usually represented by a chronologically ordered sequence of observed locations. The analysis of movement patterns based on detailed positional information creates opportunities for applications that can improve business decisions and processes in a broad spectrum of industries (e.g., transportation, traffic control, or medicine). Due to the large data volumes generated in these applications, the cost-efficient storage of spatio-temporal data is desirable, especially when in-memory database systems are used to achieve interactive performance requirements. To efficiently utilize the available DRAM capacities, modern database systems support various tuning possibilities to reduce the memory footprint (e.g., data compression) or increase performance (e.g., additional indexes structures). By considering horizontal data partitioning, we can independently apply different tuning options on a fine-grained level. However, the selection of cost and performance-balancing configurations is challenging, due to the vast number of possible setups consisting of mutually dependent individual decisions. In this thesis, we introduce multiple approaches to improve spatio-temporal data management by automatically optimizing diverse tuning options for the application-specific access patterns and data characteristics. Our contributions are as follows: (1) We introduce a novel approach to determine fine-grained table configurations for spatio-temporal workloads. Our linear programming (LP) approach jointly optimizes the (i) data compression, (ii) ordering, (iii) indexing, and (iv) tiering. We propose different models which address cost dependencies at different levels of accuracy to compute optimized tuning configurations for a given workload, memory budgets, and data characteristics. To yield maintainable and robust configurations, we further extend our LP-based approach to incorporate reconfiguration costs as well as optimizations for multiple potential workload scenarios. (2) To optimize the storage layout of timestamps in columnar databases, we present a heuristic approach for the workload-driven combined selection of a data layout and compression scheme. By considering attribute decomposition strategies, we are able to apply application-specific optimizations that reduce the memory footprint and improve performance. (3) We introduce an approach that leverages past trajectory data to improve the dispatch processes of transportation network companies. Based on location probabilities, we developed risk-averse dispatch strategies that reduce critical delays. (4) Finally, we used the use case of a transportation network company to evaluate our database optimizations on a real-world dataset. We demonstrate that workload-driven fine-grained optimizations allow us to reduce the memory footprint (up to 71\% by equal performance) or increase the performance (up to 90\% by equal memory size) compared to established rule-based heuristics. Individually, our contributions provide novel approaches to the current challenges in spatio-temporal data mining and database research. Combining them allows in-memory databases to store and process spatio-temporal data more cost-efficiently.}, language = {en} } @phdthesis{Taleb2024, author = {Taleb, Aiham}, title = {Self-supervised deep learning methods for medical image analysis}, doi = {10.25932/publishup-64408}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-644089}, school = {Universit{\"a}t Potsdam}, pages = {xii, 171}, year = {2024}, abstract = {Deep learning has seen widespread application in many domains, mainly for its ability to learn data representations from raw input data. Nevertheless, its success has so far been coupled with the availability of large annotated (labelled) datasets. This is a requirement that is difficult to fulfil in several domains, such as in medical imaging. Annotation costs form a barrier in extending deep learning to clinically-relevant use cases. The labels associated with medical images are scarce, since the generation of expert annotations of multimodal patient data at scale is non-trivial, expensive, and time-consuming. This substantiates the need for algorithms that learn from the increasing amounts of unlabeled data. Self-supervised representation learning algorithms offer a pertinent solution, as they allow solving real-world (downstream) deep learning tasks with fewer annotations. Self-supervised approaches leverage unlabeled samples to acquire generic features about different concepts, enabling annotation-efficient downstream task solving subsequently. Nevertheless, medical images present multiple unique and inherent challenges for existing self-supervised learning approaches, which we seek to address in this thesis: (i) medical images are multimodal, and their multiple modalities are heterogeneous in nature and imbalanced in quantities, e.g. MRI and CT; (ii) medical scans are multi-dimensional, often in 3D instead of 2D; (iii) disease patterns in medical scans are numerous and their incidence exhibits a long-tail distribution, so it is oftentimes essential to fuse knowledge from different data modalities, e.g. genomics or clinical data, to capture disease traits more comprehensively; (iv) Medical scans usually exhibit more uniform color density distributions, e.g. in dental X-Rays, than natural images. Our proposed self-supervised methods meet these challenges, besides significantly reducing the amounts of required annotations. We evaluate our self-supervised methods on a wide array of medical imaging applications and tasks. Our experimental results demonstrate the obtained gains in both annotation-efficiency and performance; our proposed methods outperform many approaches from related literature. Additionally, in case of fusion with genetic modalities, our methods also allow for cross-modal interpretability. In this thesis, not only we show that self-supervised learning is capable of mitigating manual annotation costs, but also our proposed solutions demonstrate how to better utilize it in the medical imaging domain. Progress in self-supervised learning has the potential to extend deep learning algorithms application to clinical scenarios.}, language = {en} } @inproceedings{MarxBruenkerMirbabaieetal.2024, author = {Marx, Julian and Br{\"u}nker, Felix and Mirbabaie, Milad and Stieglitz, Stefan}, title = {Digital activism on social media}, series = {Proceedings of the 57th Annual Hawaii International Conference on System Sciences}, booktitle = {Proceedings of the 57th Annual Hawaii International Conference on System Sciences}, editor = {Bui, Tung X.}, publisher = {Department of IT Management Shidler College of Business University of Hawaii}, address = {Honolulu, HI}, isbn = {978-0-99813-317-1}, pages = {7205 -- 7214}, year = {2024}, abstract = {Social media constitute an important arena for public debates and steady interchange of issues relevant to society. To boost their reputation, commercial organizations also engage in political, social, or environmental debates on social media. To engage in this type of digital activism, organizations increasingly utilize the social media profiles of executive employees and other brand ambassadors. However, the relationship between brand ambassadors' digital activism and corporate reputation is only vaguely understood. The results of a qualitative inquiry suggest that digital activism via brand ambassadors can be risky (e.g., creating additional surface for firestorms, financial loss) and rewarding (e.g., emitting authenticity, employing 'megaphones' for industry change) at the same time. The paper informs both scholarship and practitioners about strategic trade-offs that need to be considered when employing brand ambassadors for digital activism.}, language = {en} }