@article{ŞahinEgloffsteinBotheetal.2021, author = {{\c{S}}ahin, Muhittin and Egloffstein, Marc and Bothe, Max and Rohloff, Tobias and Schenk, Nathanael and Schwerer, Florian and Ifenthaler, Dirk}, title = {Behavioral Patterns in Enterprise {MOOCs} at {openSAP}}, series = {EMOOCs 2021}, volume = {2021}, journal = {EMOOCs 2021}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-512-5}, doi = {10.25932/publishup-51735}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-517350}, pages = {281--288}, year = {2021}, language = {en} } @article{OezdemirKurbanPekkan2021, author = {{\"O}zdemir, Paker Do{\u{g}}u and Kurban, Caroline Fell and Pekkan, Zelha Tun{\c{c}}}, title = {{MOOC}-Based Online Instruction}, series = {EMOOCs 2021}, volume = {2021}, journal = {EMOOCs 2021}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-512-5}, doi = {10.25932/publishup-51690}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-516900}, pages = {17--33}, year = {2021}, abstract = {If taking a flipped learning approach, MOOC content can be used for online pre-class instruction. After which students can put the knowledge they gained from the MOOC into practice either synchronously or asynchronously. This study examined one such, asynchronous, course in teacher education. The course ran with 40 students over 13 weeks from February to May 2020. A case study approach was followed using mixed methods to assess the efficacy of the course. Quantitative data was gathered on achievement of learning outcomes, online engagement, and satisfaction. Qualitative data was gathered via student interviews from which a thematic analysis was undertaken. 
From a combined analysis of the data, three themes emerged as pertinent to course efficacy: quality and quantity of communication and collaboration; suitability of the MOOC; and significance for career development.}, language = {en} } @phdthesis{Zuo2017, author = {Zuo, Zhe}, title = {From unstructured to structured: Context-based named entity mining from text}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-412576}, school = {Universit{\"a}t Potsdam}, pages = {vii, 112}, year = {2017}, abstract = {With recent advances in the area of information extraction, automatically extracting structured information from a vast amount of unstructured textual data becomes an important task, which is infeasible for humans to capture all information manually. Named entities (e.g., persons, organizations, and locations), which are crucial components in texts, are usually the subjects of structured information from textual documents. Therefore, the task of named entity mining receives much attention. It consists of three major subtasks, which are named entity recognition, named entity linking, and relation extraction. These three tasks build up an entire pipeline of a named entity mining system, where each of them has its challenges and can be employed for further applications. As a fundamental task in the natural language processing domain, studies on named entity recognition have a long history, and many existing approaches produce reliable results. The task is aiming to extract mentions of named entities in text and identify their types. Named entity linking recently received much attention with the development of knowledge bases that contain rich information about entities. The goal is to disambiguate mentions of named entities and to link them to the corresponding entries in a knowledge base. 
Relation extraction, as the final step of named entity mining, is a highly challenging task, which is to extract semantic relations between named entities, e.g., the ownership relation between two companies. In this thesis, we review the state-of-the-art of named entity mining domain in detail, including valuable features, techniques, evaluation methodologies, and so on. Furthermore, we present two of our approaches that focus on the named entity linking and relation extraction tasks separately. To solve the named entity linking task, we propose the entity linking technique, BEL, which operates on a textual range of relevant terms and aggregates decisions from an ensemble of simple classifiers. Each of the classifiers operates on a randomly sampled subset of the above range. In extensive experiments on hand-labeled and benchmark datasets, our approach outperformed state-of-the-art entity linking techniques, both in terms of quality and efficiency. For the task of relation extraction, we focus on extracting a specific group of difficult relation types, business relations between companies. These relations can be used to gain valuable insight into the interactions between companies and perform complex analytics, such as predicting risk or valuating companies. Our semi-supervised strategy can extract business relations between companies based on only a few user-provided seed company pairs. By doing so, we also provide a solution for the problem of determining the direction of asymmetric relations, such as the ownership\_of relation. We improve the reliability of the extraction process by using a holistic pattern identification method, which classifies the generated extraction patterns. 
Our experiments show that we can accurately and reliably extract new entity pairs occurring in the target relation by using as few as five labeled seed pairs.}, language = {en} } @book{ZhangPlauthEberhardtetal.2020, author = {Zhang, Shuhao and Plauth, Max and Eberhardt, Felix and Polze, Andreas and Lehmann, Jens and Sejdiu, Gezim and Jabeen, Hajira and Servadei, Lorenzo and M{\"o}stl, Christian and B{\"a}r, Florian and Netzeband, Andr{\'e} and Schmidt, Rainer and Knigge, Marlene and Hecht, Sonja and Prifti, Loina and Krcmar, Helmut and Sapegin, Andrey and Jaeger, David and Cheng, Feng and Meinel, Christoph and Friedrich, Tobias and Rothenberger, Ralf and Sutton, Andrew M. and Sidorova, Julia A. and Lundberg, Lars and Rosander, Oliver and Sk{\"o}ld, Lars and Di Varano, Igor and van der Walt, Est{\'e}e and Eloff, Jan H. P. and Fabian, Benjamin and Baumann, Annika and Ermakova, Tatiana and Kelkel, Stefan and Choudhary, Yash and Cooray, Thilini and Rodr{\'i}guez, Jorge and Medina-P{\'e}rez, Miguel Angel and Trejo, Luis A. and Barrera-Animas, Ari Yair and Monroy-Borja, Ra{\'u}l and L{\'o}pez-Cuevas, Armando and Ram{\'i}rez-M{\'a}rquez, Jos{\'e} Emmanuel and Grohmann, Maria and Niederleithinger, Ernst and Podapati, Sasidhar and Schmidt, Christopher and Huegle, Johannes and de Oliveira, Roberto C. L. 
and Soares, F{\'a}bio Mendes and van Hoorn, Andr{\'e} and Neumer, Tamas and Willnecker, Felix and Wilhelm, Mathias and Kuster, Bernhard}, title = {{HPI} Future {SOC} Lab - Proceedings 2017}, number = {130}, editor = {Meinel, Christoph and Polze, Andreas and Beins, Karsten and Strotmann, Rolf and Seibold, Ulrich and R{\"o}dszus, Kurt and M{\"u}ller, J{\"u}rgen}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-475-3}, issn = {1613-5652}, doi = {10.25932/publishup-43310}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-433100}, institution = {Universit{\"a}t Potsdam}, pages = {ix, 235}, year = {2020}, abstract = {The ``HPI Future SOC Lab'' is a cooperation of the Hasso Plattner Institute (HPI) and industry partners. Its mission is to enable and promote exchange and interaction between the research community and the industry partners. The HPI Future SOC Lab provides researchers with free of charge access to a complete infrastructure of state of the art hard and software. This infrastructure includes components, which might be too expensive for an ordinary research environment, such as servers with up to 64 cores and 2 TB main memory. The offerings address researchers particularly from but not limited to the areas of computer science and business information systems. Main areas of research include cloud computing, parallelization, and In-Memory technologies. This technical report presents results of research projects executed in 2017. 
Selected projects have presented their results on April 25th and November 15th 2017 at the Future SOC Lab Day events.}, language = {en} } @phdthesis{Wolf2021, author = {Wolf, Johannes}, title = {Analysis and visualization of transport infrastructure based on large-scale geospatial mobile mapping data}, doi = {10.25932/publishup-53612}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-536129}, school = {Universit{\"a}t Potsdam}, pages = {vi, 121}, year = {2021}, abstract = {3D point clouds are a universal and discrete digital representation of three-dimensional objects and environments. For geospatial applications, 3D point clouds have become a fundamental type of raw data acquired and generated using various methods and techniques. In particular, 3D point clouds serve as raw data for creating digital twins of the built environment. This thesis concentrates on the research and development of concepts, methods, and techniques for preprocessing, semantically enriching, analyzing, and visualizing 3D point clouds for applications around transport infrastructure. It introduces a collection of preprocessing techniques that aim to harmonize raw 3D point cloud data, such as point density reduction and scan profile detection. Metrics such as, e.g., local density, verticality, and planarity are calculated for later use. One of the key contributions tackles the problem of analyzing and deriving semantic information in 3D point clouds. Three different approaches are investigated: a geometric analysis, a machine learning approach operating on synthetically generated 2D images, and a machine learning approach operating on 3D point clouds without intermediate representation. In the first application case, 2D image classification is applied and evaluated for mobile mapping data focusing on road networks to derive road marking vector data. 
The second application case investigates how 3D point clouds can be merged with ground-penetrating radar data for a combined visualization and to automatically identify atypical areas in the data. For example, the approach detects pavement regions with developing potholes. The third application case explores the combination of a 3D environment based on 3D point clouds with panoramic imagery to improve visual representation and the detection of 3D objects such as traffic signs. The presented methods were implemented and tested based on software frameworks for 3D point clouds and 3D visualization. In particular, modules for metric computation, classification procedures, and visualization techniques were integrated into a modular pipeline-based C++ research framework for geospatial data processing, extended by Python machine learning scripts. All visualization and analysis techniques scale to large real-world datasets such as road networks of entire cities or railroad networks. The thesis shows that some use cases allow taking advantage of established image vision methods to analyze images rendered from mobile mapping data efficiently. The two presented semantic classification methods working directly on 3D point clouds are use case independent and show similar overall accuracy when compared to each other. While the geometry-based method requires less computation time, the machine learning-based method supports arbitrary semantic classes but requires training the network with ground truth data. Both methods can be used in combination to gradually build this ground truth with manual corrections via a respective annotation tool. This thesis contributes results for IT system engineering of applications, systems, and services that require spatial digital twins of transport infrastructure such as road networks and railroad networks based on 3D point clouds as raw data. 
It demonstrates the feasibility of fully automated data flows that map captured 3D point clouds to semantically classified models. This provides a key component for seamlessly integrated spatial digital twins in IT solutions that require up-to-date, object-based, and semantically enriched information about the built environment.}, language = {en} } @article{WittigMirandaHoelzeretal.2022, author = {Wittig, Alice and Miranda, Fabio Malcher and H{\"o}lzer, Martin and Altenburg, Tom and Bartoszewicz, Jakub Maciej and Beyvers, Sebastian and Dieckmann, Marius Alfred and Genske, Ulrich and Giese, Sven Hans-Joachim and Nowicka, Melania and Richard, Hugues and Schiebenhoefer, Henning and Schmachtenberg, Anna-Juliane and Sieben, Paul and Tang, Ming and Tembrockhaus, Julius and Renard, Bernhard Y. and Fuchs, Stephan}, title = {{CovRadar}}, series = {Bioinformatics}, volume = {38}, journal = {Bioinformatics}, number = {17}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1367-4803}, doi = {10.1093/bioinformatics/btac411}, pages = {4223--4225}, year = {2022}, abstract = {The ongoing pandemic caused by SARS-CoV-2 emphasizes the importance of genomic surveillance to understand the evolution of the virus, to monitor the viral population, and plan epidemiological responses. Detailed analysis, easy visualization and intuitive filtering of the latest viral sequences are powerful for this purpose. We present CovRadar, a tool for genomic surveillance of the SARS-CoV-2 Spike protein. CovRadar consists of an analytical pipeline and a web application that enable the analysis and visualization of hundreds of thousand sequences. 
First, CovRadar extracts the regions of interest using local alignment, then builds a multiple sequence alignment, infers variants and consensus and finally presents the results in an interactive app, making accessing and reporting simple, flexible and fast.}, language = {en} } @article{WiemkerBunovaNeufeldetal.2022, author = {Wiemker, Veronika and Bunova, Anna and Neufeld, Maria and Gornyi, Boris and Yurasova, Elena and Konigorski, Stefan and Kalinina, Anna and Kontsevaya, Anna and Ferreira-Borges, Carina and Probst, Charlotte}, title = {Pilot study to evaluate usability and acceptability of the {`Animated Alcohol Assessment Tool'} in {Russian} primary healthcare}, series = {Digital health}, volume = {8}, journal = {Digital health}, publisher = {Sage Publications}, address = {London}, issn = {2055-2076}, doi = {10.1177/20552076211074491}, pages = {11}, year = {2022}, abstract = {Background and aims: Accurate and user-friendly assessment tools quantifying alcohol consumption are a prerequisite to effective prevention and treatment programmes, including Screening and Brief Intervention. Digital tools offer new potential in this field. We developed the 'Animated Alcohol Assessment Tool' (AAA-Tool), a mobile app providing an interactive version of the World Health Organization's Alcohol Use Disorders Identification Test (AUDIT) that facilitates the description of individual alcohol consumption via culturally informed animation features. This pilot study evaluated the Russia-specific version of the Animated Alcohol Assessment Tool with regard to (1) its usability and acceptability in a primary healthcare setting, (2) the plausibility of its alcohol consumption assessment results and (3) the adequacy of its Russia-specific vessel and beverage selection. 
Methods: Convenience samples of 55 patients (47\% female) and 15 healthcare practitioners (80\% female) in 2 Russian primary healthcare facilities self-administered the Animated Alcohol Assessment Tool and rated their experience on the Mobile Application Rating Scale - User Version. Usage data was automatically collected during app usage, and additional feedback on regional content was elicited in semi-structured interviews. Results: On average, patients completed the Animated Alcohol Assessment Tool in 6:38 min (SD = 2.49, range = 3.00-17.16). User satisfaction was good, with all subscale Mobile Application Rating Scale - User Version scores averaging >3 out of 5 points. A majority of patients (53\%) and practitioners (93\%) would recommend the tool to 'many people' or 'everyone'. Assessed alcohol consumption was plausible, with a low number (14\%) of logically impossible entries. Most patients reported the Animated Alcohol Assessment Tool to reflect all vessels (78\%) and all beverages (71\%) they typically used. Conclusion: High acceptability ratings by patients and healthcare practitioners, acceptable completion time, plausible alcohol usage assessment results and perceived adequacy of region-specific content underline the Animated Alcohol Assessment Tool's potential to provide a novel approach to alcohol assessment in primary healthcare. 
After its validation, the Animated Alcohol Assessment Tool might contribute to reducing alcohol-related harm by facilitating Screening and Brief Intervention implementation in Russia and beyond.}, language = {en} } @book{Weber2023, author = {Weber, Benedikt}, title = {Human pose estimation for decubitus prophylaxis}, number = {153}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-551-4}, issn = {1613-5652}, doi = {10.25932/publishup-56719}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-567196}, institution = {Universit{\"a}t Potsdam}, pages = {73}, year = {2023}, abstract = {Decubitus is one of the most relevant diseases in nursing and the most expensive to treat. It is caused by sustained pressure on tissue, so it particularly affects bed-bound patients. This work lays a foundation for pressure mattress-based decubitus prophylaxis by implementing a solution to the single-frame 2D Human Pose Estimation problem. For this, methods of Deep Learning are employed. Two approaches are examined, a coarse-to-fine Convolutional Neural Network for direct regression of joint coordinates and a U-Net for the derivation of probability distribution heatmaps. We conclude that training our models on a combined dataset of the publicly available Bodies at Rest and SLP data yields the best results. Furthermore, various preprocessing techniques are investigated, and a hyperparameter optimization is performed to discover an improved model architecture. Another finding indicates that the heatmap-based approach outperforms direct regression. This model achieves a mean per-joint position error of 9.11 cm for the Bodies at Rest data and 7.43 cm for the SLP data. We find that it generalizes well on data from mattresses other than those seen during training but has difficulties detecting the arms correctly. 
Additionally, we give a brief overview of the medical data annotation tool annoto we developed in the bachelor project and furthermore conclude that the Scrum framework and agile practices enhanced our development workflow.}, language = {en} } @phdthesis{Vogel2018, author = {Vogel, Thomas}, title = {Model-driven engineering of self-adaptive software}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-409755}, school = {Universit{\"a}t Potsdam}, pages = {xvi, 357}, year = {2018}, abstract = {The development of self-adaptive software requires the engineering of an adaptation engine that controls the underlying adaptable software by a feedback loop. State-of-the-art approaches prescribe the feedback loop in terms of numbers, how the activities (e.g., monitor, analyze, plan, and execute (MAPE)) and the knowledge are structured to a feedback loop, and the type of knowledge. Moreover, the feedback loop is usually hidden in the implementation or framework and therefore not visible in the architectural design. Additionally, an adaptation engine often employs runtime models that either represent the adaptable software or capture strategic knowledge such as reconfiguration strategies. State-of-the-art approaches do not systematically address the interplay of such runtime models, which would otherwise allow developers to freely design the entire feedback loop. This thesis presents ExecUtable RuntimE MegAmodels (EUREMA), an integrated model-driven engineering (MDE) solution that rigorously uses models for engineering feedback loops. EUREMA provides a domain-specific modeling language to specify and an interpreter to execute feedback loops. The language allows developers to freely design a feedback loop concerning the activities and runtime models (knowledge) as well as the number of feedback loops. It further supports structuring the feedback loops in the adaptation engine that follows a layered architectural style. 
Thus, EUREMA makes the feedback loops explicit in the design and enables developers to reason about design decisions. To address the interplay of runtime models, we propose the concept of a runtime megamodel, which is a runtime model that contains other runtime models as well as activities (e.g., MAPE) working on the contained models. This concept is the underlying principle of EUREMA. The resulting EUREMA (mega)models are kept alive at runtime and they are directly executed by the EUREMA interpreter to run the feedback loops. Interpretation provides the flexibility to dynamically adapt a feedback loop. In this context, EUREMA supports engineering self-adaptive software in which feedback loops run independently or in a coordinated fashion within the same layer as well as on top of each other in different layers of the adaptation engine. Moreover, we consider preliminary means to evolve self-adaptive software by providing a maintenance interface to the adaptation engine. This thesis discusses in detail EUREMA by applying it to different scenarios such as single, multiple, and stacked feedback loops for self-repairing and self-optimizing the mRUBiS application. Moreover, it investigates the design and expressiveness of EUREMA, reports on experiments with a running system (mRUBiS) and with alternative solutions, and assesses EUREMA with respect to quality attributes such as performance and scalability. 
The conducted evaluation provides evidence that EUREMA as an integrated and open MDE approach for engineering self-adaptive software seamlessly integrates the development and runtime environments using the same formalism to specify and execute feedback loops, supports the dynamic adaptation of feedback loops in layered architectures, and achieves an efficient execution of feedback loops by leveraging incrementality.}, language = {en} } @article{VitaglianoHameedJiangetal.2023, author = {Vitagliano, Gerardo and Hameed, Mazhar and Jiang, Lan and Reisener, Lucas and Wu, Eugene and Naumann, Felix}, title = {Pollock: A Data Loading Benchmark}, series = {Proceedings of the VLDB Endowment}, volume = {16}, journal = {Proceedings of the VLDB Endowment}, number = {8}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {2150-8097}, doi = {10.14778/3594512.3594518}, pages = {1870--1882}, year = {2023}, abstract = {Any system at play in a data-driven project has a fundamental requirement: the ability to load data. The de-facto standard format to distribute and consume raw data is CSV. Yet, the plain text and flexible nature of this format make such files often difficult to parse and correctly load their content, requiring cumbersome data preparation steps. We propose a benchmark to assess the robustness of systems in loading data from non-standard CSV formats and with structural inconsistencies. First, we formalize a model to describe the issues that affect real-world files and use it to derive a systematic ``pollution'' process to generate dialects for any given grammar. Our benchmark leverages the pollution framework for the csv format. To guide pollution, we have surveyed thousands of real-world, publicly available csv files, recording the problems we encountered. 
We demonstrate the applicability of our benchmark by testing and scoring 16 different systems: popular csv parsing frameworks, relational database tools, spreadsheet systems, and a data visualization tool.}, language = {en} } @phdthesis{Vitagliano2024, author = {Vitagliano, Gerardo}, title = {Modeling the structure of tabular files for data preparation}, doi = {10.25932/publishup-62435}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-624351}, school = {Universit{\"a}t Potsdam}, pages = {ii, 114}, year = {2024}, abstract = {To manage tabular data files and leverage their content in a given downstream task, practitioners often design and execute complex transformation pipelines to prepare them. The complexity of such pipelines stems from different factors, including the nature of the preparation tasks, often exploratory or ad-hoc to specific datasets; the large repertory of tools, algorithms, and frameworks that practitioners need to master; and the volume, variety, and velocity of the files to be prepared. Metadata plays a fundamental role in reducing this complexity: characterizing a file assists end users in the design of data preprocessing pipelines, and furthermore paves the way for suggestion, automation, and optimization of data preparation tasks. Previous research in the areas of data profiling, data integration, and data cleaning, has focused on extracting and characterizing metadata regarding the content of tabular data files, i.e., about the records and attributes of tables. Content metadata are useful for the latter stages of a preprocessing pipeline, e.g., error correction, duplicate detection, or value normalization, but they require a properly formed tabular input. Therefore, these metadata are not relevant for the early stages of a preparation pipeline, i.e., to correctly parse tables out of files. 
In this dissertation, we turn our focus to what we call the structure of a tabular data file, i.e., the set of characters within a file that do not represent data values but are required to parse and understand the content of the file. We provide three different approaches to represent file structure, an explicit representation based on context-free grammars; an implicit representation based on file-wise similarity; and a learned representation based on machine learning. In our first contribution, we use the grammar-based representation to characterize a set of over 3000 real-world csv files and identify multiple structural issues that let files deviate from the csv standard, e.g., by having inconsistent delimiters or containing multiple tables. We leverage our learnings about real-world files and propose Pollock, a benchmark to test how well systems parse csv files that have a non-standard structure, without any previous preparation. We report on our experiments on using Pollock to evaluate the performance of 16 real-world data management systems. Following, we characterize the structure of files implicitly, by defining a measure of structural similarity for file pairs. We design a novel algorithm to compute this measure, which is based on a graph representation of the files' content. We leverage this algorithm and propose Mondrian, a graphical system to assist users in identifying layout templates in a dataset, classes of files that have the same structure, and therefore can be prepared by applying the same preparation pipeline. Finally, we introduce MaGRiTTE, a novel architecture that uses self-supervised learning to automatically learn structural representations of files in the form of vectorial embeddings at three different levels: cell level, row level, and file level. We experiment with the application of structural embeddings for several tasks, namely dialect detection, row classification, and data preparation efforts estimation. 
Our experimental results show that structural metadata, either identified explicitly on parsing grammars, derived implicitly as file-wise similarity, or learned with the help of machine learning architectures, is fundamental to automate several tasks, to scale up preparation to large quantities of files, and to provide repeatable preparation pipelines.}, language = {en} } @book{vanderWaltOdunAyoBastianetal.2018, author = {van der Walt, Estee and Odun-Ayo, Isaac and Bastian, Matthias and Eldin Elsaid, Mohamed Esam}, title = {Proceedings of the Fifth {HPI} Cloud Symposium {``Operating the Cloud''} 2017}, number = {122}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-432-6}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-411330}, institution = {Universit{\"a}t Potsdam}, pages = {70}, year = {2018}, abstract = {Every year, the Hasso Plattner Institute (HPI) invites guests from industry and academia to a collaborative scientific workshop on the topic Operating the Cloud. Our goal is to provide a forum for the exchange of knowledge and experience between industry and academia. Co-located with the event is the HPI's Future SOC Lab day, which offers an additional attractive and conducive environment for scientific and industry related discussions. Operating the Cloud aims to be a platform for productive interactions of innovative ideas, visions, and upcoming technologies in the field of cloud operation and administration. In these proceedings, the results of the fifth HPI cloud symposium Operating the Cloud 2017 are published. We thank the authors for exciting presentations and insights into their current work and research. 
Moreover, we look forward to more interesting submissions for the upcoming symposium in 2018.}, language = {en} } @article{UlrichLutfiRutzenetal.2022, author = {Ulrich, Jens-Uwe and Lutfi, Ahmad and Rutzen, Kilian and Renard, Bernhard Y.}, title = {{ReadBouncer}}, series = {Bioinformatics}, volume = {38}, journal = {Bioinformatics}, number = {SUPPL 1}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1367-4803}, doi = {10.1093/bioinformatics/btac223}, pages = {153--160}, year = {2022}, abstract = {Motivation: Nanopore sequencers allow targeted sequencing of interesting nucleotide sequences by rejecting other sequences from individual pores. This feature facilitates the enrichment of low-abundant sequences by depleting overrepresented ones in-silico. Existing tools for adaptive sampling either apply signal alignment, which cannot handle human-sized reference sequences, or apply read mapping in sequence space relying on fast graphical processing units (GPU) base callers for real-time read rejection. Using nanopore long-read mapping tools is also not optimal when mapping shorter reads as usually analyzed in adaptive sampling applications. Results: Here, we present a new approach for nanopore adaptive sampling that combines fast CPU and GPU base calling with read classification based on Interleaved Bloom Filters. ReadBouncer improves the potential enrichment of low abundance sequences by its high read classification sensitivity and specificity, outperforming existing tools in the field. It robustly removes even reads belonging to large reference sequences while running on commodity hardware without GPUs, making adaptive sampling accessible for in-field researchers. 
Readbouncer also provides a user-friendly interface and installer files for end-users without a bioinformatics background.}, language = {en} } @article{TrautmannZhouBrahmsetal.2021, author = {Trautmann, Justin and Zhou, Lin and Brahms, Clemens Markus and Tunca, Can and Ersoy, Cem and Granacher, Urs and Arnrich, Bert}, title = {{TRIPOD}}, series = {Data : open access `Data in science' journal}, volume = {6}, journal = {Data : open access `Data in science' journal}, number = {9}, publisher = {MDPI}, address = {Basel}, issn = {2306-5729}, doi = {10.3390/data6090095}, pages = {19}, year = {2021}, abstract = {Inertial measurement units (IMUs) enable easy to operate and low-cost data recording for gait analysis. When combined with treadmill walking, a large number of steps can be collected in a controlled environment without the need of a dedicated gait analysis laboratory. In order to evaluate existing and novel IMU-based gait analysis algorithms for treadmill walking, a reference dataset that includes IMU data as well as reliable ground truth measurements for multiple participants and walking speeds is needed. This article provides a reference dataset consisting of 15 healthy young adults who walked on a treadmill at three different speeds. Data were acquired using seven IMUs placed on the lower body, two different reference systems (Zebris FDMT-HQ and OptoGait), and two RGB cameras. Additionally, in order to validate an existing IMU-based gait analysis algorithm using the dataset, an adaptable modular data analysis pipeline was built. Our results show agreement between the pressure-sensitive Zebris and the photoelectric OptoGait system (r = 0.99), demonstrating the quality of our reference data. As a use case, the performance of an algorithm originally designed for overground walking was tested on treadmill data using the data pipeline. 
The accuracy of stride length and stride time estimations was comparable to that reported in other studies with overground data, indicating that the algorithm is equally applicable to treadmill data. The Python source code of the data pipeline is publicly available, and the dataset will be provided by the authors upon request, enabling future evaluations of IMU gait analysis algorithms without the need of recording new data.}, language = {en} } @phdthesis{TorcatoMordido2021, author = {Torcato Mordido, Gon{\c{c}}alo Filipe}, title = {Diversification, compression, and evaluation methods for generative adversarial networks}, doi = {10.25932/publishup-53546}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-535460}, school = {Universit{\"a}t Potsdam}, pages = {xiii, 148}, year = {2021}, abstract = {Generative adversarial networks (GANs) have been broadly applied to a wide range of application domains since their proposal. In this thesis, we propose several methods that aim to tackle different existing problems in GANs. Particularly, even though GANs are generally able to generate high-quality samples, the diversity of the generated set is often sub-optimal. Moreover, the common increase of the number of models in the original GANs framework, as well as their architectural sizes, introduces additional costs. Additionally, even though challenging, the proper evaluation of a generated set is an important direction to ultimately improve the generation process in GANs. We start by introducing two diversification methods that extend the original GANs framework to multiple adversaries to stimulate sample diversity in a generated set. Then, we introduce a new post-training compression method based on Monte Carlo methods and importance sampling to quantize and prune the weights and activations of pre-trained neural networks without any additional training. 
The previous method may be used to reduce the memory and computational costs introduced by increasing the number of models in the original GANs framework. Moreover, we use a similar procedure to quantize and prune gradients during training, which also reduces the communication costs between different workers in a distributed training setting. We introduce several topology-based evaluation methods to assess data generation in different settings, namely image generation and language generation. Our methods retrieve both single-valued and double-valued metrics, which, given a real set, may be used to broadly assess a generated set or separately evaluate sample quality and sample diversity, respectively. Moreover, two of our metrics use locality-sensitive hashing to accurately assess the generated sets of highly compressed GANs. The analysis of the compression effects in GANs paves the way for their efficient employment in real-world applications. Given their general applicability, the methods proposed in this thesis may be extended beyond the context of GANs. Hence, they may be generally applied to enhance existing neural networks and, in particular, generative frameworks.}, language = {en} } @article{TopaliChountaOrtegaArranzetal.2021, author = {Topali, Paraskevi and Chounta, Irene-Angelica and Ortega-Arranz, Alejandro and Villagr{\´a}-Sobrino, Sara L. and Mart{\´i}nez-Mon{\´e}s, Alejandra}, title = {CoFeeMOOC-v.2}, series = {EMOOCs 2021}, volume = {2021}, journal = {EMOOCs 2021}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-512-5}, doi = {10.25932/publishup-51724}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-517241}, pages = {209 -- 217}, year = {2021}, abstract = {Providing adequate support to MOOC participants is often a challenging task due to massiveness of the learners' population and the asynchronous communication among peers and MOOC practitioners. 
This workshop aims at discussing common learners' problems reported in the literature and reflect on designing adequate feedback interventions with the use of learning data. Our aim is three-fold: a) to pinpoint MOOC aspects that impact the planning of feedback, b) to explore the use of learning data in designing feedback strategies, and c) to propose design guidelines for developing and delivering scaffolding interventions for personalized feedback in MOOCs. To do so, we will carry out hands-on activities that aim to involve participants in interpreting learning data and using them to design adaptive feedback. This workshop appeals to researchers, practitioners and MOOC stakeholders who aim to providing contextualized scaffolding. We envision that this workshop will provide insights for bridging the gap between pedagogical theory and practice when it comes to feedback interventions in MOOCs.}, language = {en} } @phdthesis{Taeumel2020, author = {Taeumel, Marcel}, title = {Data-driven tool construction in exploratory programming environments}, doi = {10.25932/publishup-44428}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-444289}, school = {Universit{\"a}t Potsdam}, pages = {xiv, 299}, year = {2020}, abstract = {This work presents a new design for programming environments that promote the exploration of domain-specific software artifacts and the construction of graphical tools for such program comprehension tasks. In complex software projects, tool building is essential because domain- or task-specific tools can support decision making by representing concerns concisely with low cognitive effort. In contrast, generic tools can only support anticipated scenarios, which usually align with programming language concepts or well-known project domains. However, the creation and modification of interactive tools is expensive because the glue that connects data to graphics is hard to find, change, and test. 
Even if valuable data is available in a common format and even if promising visualizations could be populated, programmers have to invest many resources to make changes in the programming environment. Consequently, only ideas of predictably high value will be implemented. In the non-graphical, command-line world, the situation looks different and inspiring: programmers can easily build their own tools as shell scripts by configuring and combining filter programs to process data. We propose a new perspective on graphical tools and provide a concept to build and modify such tools with a focus on high quality, low effort, and continuous adaptability. That is, (1) we propose an object-oriented, data-driven, declarative scripting language that reduces the amount of and governs the effects of glue code for view-model specifications, and (2) we propose a scalable UI-design language that promotes short feedback loops in an interactive, graphical environment such as Morphic known from Self or Squeak/Smalltalk systems. We implemented our concept as a tool building environment, which we call VIVIDE, on top of Squeak/Smalltalk and Morphic. We replaced existing code browsing and debugging tools to iterate within our solution more quickly. In several case studies with undergraduate and graduate students, we observed that VIVIDE can be applied to many domains such as live language development, source-code versioning, modular code browsing, and multi-language debugging. Then, we designed a controlled experiment to measure the effect on the time to build tools. Several pilot runs showed that training is crucial and, presumably, takes days or weeks, which implies a need for further research. As a result, programmers as users can directly work with tangible representations of their software artifacts in the VIVIDE environment. Tool builders can write domain-specific scripts to populate views to approach comprehension tasks from different angles. 
Our novel perspective on graphical tools can inspire the creation of new trade-offs in modularity for both data providers and view designers.}, language = {en} } @phdthesis{Shekhar2023, author = {Shekhar, Sumit}, title = {Image and video processing based on intrinsic attributes}, doi = {10.25932/publishup-62004}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-620049}, school = {Universit{\"a}t Potsdam}, pages = {xii, 143}, year = {2023}, abstract = {Advancements in computer vision techniques driven by machine learning have facilitated robust and efficient estimation of attributes such as depth, optical flow, albedo, and shading. To encapsulate all such underlying properties associated with images and videos, we evolve the concept of intrinsic images towards intrinsic attributes. Further, rapid hardware growth in the form of high-quality smartphone cameras, readily available depth sensors, mobile GPUs, or dedicated neural processing units have made image and video processing pervasive. In this thesis, we explore the synergies between the above two advancements and propose novel image and video processing techniques and systems based on them. To begin with, we investigate intrinsic image decomposition approaches and analyze how they can be implemented on mobile devices. We propose an approach that considers not only diffuse reflection but also specular reflection; it allows us to decompose an image into specularity, albedo, and shading on a resource constrained system (e.g., smartphones or tablets) using the depth data provided by the built-in depth sensors. In addition, we explore how on-device depth data can further be used to add an immersive dimension to 2D photos, e.g., showcasing parallax effects via 3D photography. In this regard, we develop a novel system for interactive 3D photo generation and stylization on mobile devices. 
Further, we investigate how adaptive manipulation of baseline-albedo (i.e., chromaticity) can be used for efficient visual enhancement under low-lighting conditions. The proposed technique allows for interactive editing of enhancement settings while achieving improved quality and performance. We analyze the inherent optical flow and temporal noise as intrinsic properties of a video. We further propose two new techniques for applying the above intrinsic attributes for the purpose of consistent video filtering. To this end, we investigate how to remove temporal inconsistencies perceived as flickering artifacts. One of the techniques does not require costly optical flow estimation, while both provide interactive consistency control. Using intrinsic attributes for image and video processing enables new solutions for mobile devices - a pervasive visual computing device - and will facilitate novel applications for Augmented Reality (AR), 3D photography, and video stylization. The proposed low-light enhancement techniques can also improve the accuracy of high-level computer vision tasks (e.g., face detection) under low-light conditions. Finally, our approach for consistent video filtering can extend a wide range of image-based processing for videos.}, language = {en} } @article{SengCarlonGayedetal.2021, author = {Seng, Cheyvuth and Carlon, May Kristine Jonson and Gayed, John Maurice and Cross, Jeffrey S.}, title = {Long-Term Effects of Short-Term Intervention Using MOOCs for Developing Cambodian Undergraduate Research Skills}, series = {EMOOCs 2021}, volume = {2021}, journal = {EMOOCs 2021}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, doi = {10.25932/publishup-51692}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-516929}, pages = {49 -- 62}, year = {2021}, abstract = {Developing highly skilled researchers is essential to accelerate the economic progress of developing countries such as Cambodia in South East Asia. 
While there is continuing research investigating Cambodia's potential to cultivate such a workforce, the circumstances of undergraduate students in public provincial universities do not receive ample attention. This is crucial as numerous multinational corporations are participating via foreign direct investments in special economic zones at the border provinces and need talented human resources in Cambodia as well as in neighboring Southeast Asian countries such as Thailand and Vietnam. Student's research capability growth starts with one's belief in their capacity to use the necessary information tools and their potential to succeed in research. In this research paper, we look at how such beliefs, specifically research self-efficacy and information literacy, can be developed through a short-term intervention that uses MOOCs and assess their long-term effects. Our previous research has shown that short-term training intervention has immediate positive effects on the undergraduate students' self-efficacies in Cambodian public provincial universities. In this paper, we present the follow-up study results conducted sixteen months after the said short-term training intervention. Results reveal that from follow-up evaluations that while student's self-efficacies were significantly higher than before the short-term intervention was completed, they were lower than immediately after the intervention. Thus, while perfunctory interventions such as merely introducing the students to MOOCs and other relevant research tools over as little as three weeks can have significant positive effects, efforts must be made to sustain the benefits gained. 
This implication is essential to developing countries such as Cambodia that need low-cost solutions with immediate positive results in developing human resources to conduct research, particularly in areas far from more developed capital cities.}, language = {en} } @book{SeitzLinckeReinetal.2021, author = {Seitz, Klara and Lincke, Jens and Rein, Patrick and Hirschfeld, Robert}, title = {Language and tool support for 3D crochet patterns}, number = {137}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-505-7}, issn = {1613-5652}, doi = {10.25932/publishup-49253}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-492530}, publisher = {Universit{\"a}t Potsdam}, pages = {vii, 94}, year = {2021}, abstract = {Crochet is a popular handcraft all over the world. While other techniques such as knitting or weaving have received technical support over the years through machines, crochet is still a purely manual craft. Not just the act of crochet itself is manual but also the process of creating instructions for new crochet patterns, which is barely supported by domain specific digital solutions. This leads to unstructured and often also ambiguous and erroneous pattern instructions. In this report, we propose a concept to digitally represent crochet patterns. This format incorporates crochet techniques which allows domain specific support for crochet pattern designers during the pattern creation and instruction writing process. As contributions, we present a thorough domain analysis, the concept of a graph structure used as domain specific language to specify crochet patterns and a prototype of a projectional editor using the graph as representation format of patterns and a diagramming system to visualize them in 2D and 3D. By analyzing the domain, we learned about crochet techniques and pain points of designers in their pattern creation workflow. These insights are the basis on which we defined the pattern representation. 
In order to evaluate our concept, we built a prototype by which the feasibility of the concept is shown and we tested the software with professional crochet designers who approved of the concept.}, language = {en} }