@book{OPUS4-7534, title = {Process design for natural scientists}, series = {Communications in computer and information science ; 500}, journal = {Communications in computer and information science ; 500}, editor = {Lambrecht, Anna-Lena and Margaria, Tizian}, publisher = {Springer}, address = {Wiesbaden}, isbn = {978-3-662-45005-5}, pages = {X, 251}, year = {2014}, abstract = {This book presents an agile and model-driven approach to manage scientific workflows. The approach is based on the Extreme Model Driven Design (XMDD) paradigm and aims at simplifying and automating the complex data analysis processes carried out by scientists in their day-to-day work. Besides documenting the impact the workflow modeling might have on the work of natural scientists, this book serves three major purposes: 1. It acts as a primer for practitioners who are interested to learn how to think in terms of services and workflows when facing domain-specific scientific processes. 2. It provides interesting material for readers already familiar with this kind of tools, because it introduces systematically both the technologies used in each case study and the basic concepts behind them. 3. As the addressed thematic field becomes increasingly relevant for lectures in both computer science and experimental sciences, it also provides helpful material for teachers that plan similar courses.}, language = {en} } @article{Teske2014, author = {Teske, Daniel}, title = {Geocoder accuracy ranking}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {161 -- 174}, year = {2014}, abstract = {Finding an address on a map is sometimes tricky: the chosen map application may be unfamiliar with the enclosed region. There are several geocoders on the market, they have different databases and algorithms to compute the query. Consequently, the geocoding results differ in their quality. Fortunately the geocoders provide a rich set of metadata. The workflow described in this paper compares this metadata with the aim to find out which geocoder is offering the best-fitting coordinate for a given address.}, language = {en} } @article{Sens2014, author = {Sens, Henriette}, title = {Web-Based map generalization tools put to the test: a jABC workflow}, series = {Process Design for Natural Scientists: an agile model-driven approach}, journal = {Process Design for Natural Scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {175 -- 185}, year = {2014}, abstract = {Geometric generalization is a fundamental concept in the digital mapping process. An increasing amount of spatial data is provided on the web as well as a range of tools to process it. This jABC workflow is used for the automatic testing of web-based generalization services like mapshaper.org by executing its functionality, overlaying both datasets before and after the transformation and displaying them visually in a .tif file. Mostly Web Services and command line tools are used to build an environment where ESRI shapefiles can be uploaded, processed through a chosen generalization service and finally visualized in Irfanview.}, language = {en} } @article{Noack2014, author = {Noack, Franziska}, title = {CREADED: Colored-Relief application for digital elevation data}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {186 -- 199}, year = {2014}, abstract = {In the geoinformatics field, remote sensing data is often used for analyzing the characteristics of the current investigation area. This includes DEMs, which are simple raster grids containing grey scales representing the respective elevation values. The project CREADED that is presented in this paper aims at making these monochrome raster images more significant and more intuitively interpretable. For this purpose, an executable interactive model for creating a colored and relief-shaded Digital Elevation Model (DEM) has been designed using the jABC framework. The process is based on standard jABC-SIBs and SIBs that provide specific GIS functions, which are available as Web services, command line tools and scripts.}, language = {en} } @article{Respondek2014, author = {Respondek, Tobias}, title = {A workflow for computing potential areas for wind turbines}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, pages = {200 -- 215}, year = {2014}, abstract = {This paper describes the implementation of a workflow model for service-oriented computing of potential areas for wind turbines in jABC. By implementing a re-executable model the manual effort of a multi-criteria site analysis can be reduced. The aim is to determine the shift of typical geoprocessing tools of geographic information systems (GIS) from the desktop to the web. The analysis is based on a vector data set and mainly uses web services of the "Center for Spatial Information Science and Systems" (CSISS). This paper discusses effort, benefits and problems associated with the use of the web services.}, language = {en} } @article{Scheele2014, author = {Scheele, Lasse}, title = {Location analysis for placing artificial reefs}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {216 -- 228}, year = {2014}, abstract = {Location analyses are among the most common tasks while working with spatial data and geographic information systems. Automating the most frequently used procedures is therefore an important aspect of improving their usability. In this context, this project aims to design and implement a workflow, providing some basic tools for a location analysis. For the implementation with jABC, the workflow was applied to the problem of finding a suitable location for placing an artificial reef. For this analysis three parameters (bathymetry, slope and grain size of the ground material) were taken into account, processed, and visualized with the The Generic Mapping Tools (GMT), which were integrated into the workflow as jETI-SIBs. The implemented workflow thereby showed that the approach to combine jABC with GMT resulted in an user-centric yet user-friendly tool with high-quality cartographic outputs.}, language = {en} } @article{Kind2014, author = {Kind, Josephine}, title = {Creation of topographic maps}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, pages = {229 -- 238}, year = {2014}, abstract = {Location analyses are among the most common tasks while working with spatial data and geographic information systems. Automating the most frequently used procedures is therefore an important aspect of improving their usability. In this context, this project aims to design and implement a workflow, providing some basic tools for a location analysis. For the implementation with jABC, the workflow was applied to the problem of finding a suitable location for placing an artificial reef. For this analysis three parameters (bathymetry, slope and grain size of the ground material) were taken into account, processed, and visualized with the The Generic Mapping Tools (GMT), which were integrated into the workflow as jETI-SIBs. The implemented workflow thereby showed that the approach to combine jABC with GMT resulted in an user-centric yet user-friendly tool with high-quality cartographic outputs.}, language = {en} } @article{Holler2014, author = {Holler, Robin}, title = {GraffDok - a graffiti documentation application}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {239 -- 251}, year = {2014}, abstract = {GraffDok is an application helping to maintain an overview over sprayed images somewhere in a city. At the time of writing it aims at vandalism rather than at beautiful photographic graffiti in an underpass. Looking at hundreds of tags and scribbles on monuments, house walls, etc. it would be interesting to not only record them in writing but even make them accessible electronically, including images. GraffDok's workflow is simple and only requires an EXIF-GPS-tagged photograph of a graffito. It automatically determines its location by using reverse geocoding with the given GPS-coordinates and the Gisgraphy WebService. While asking the user for some more meta data, GraffDok analyses the image in parallel with this and tries to detect fore- and background - before extracting the drawing lines and make them stand alone. The command line based tool ImageMagick is used here as well as for accessing EXIF data. Any meta data is written to csv-files, which will stay easily accessible and can be integrated in TeX-files as well. The latter ones are converted to PDF at the end of the workflow, containing a table about all graffiti and a summary for each - including the generated characteristic graffiti pattern image.}, language = {en} } @article{Reso2014, author = {Reso, Judith}, title = {Protein Classification Workflow}, series = {Process Design for Natural Scientists: an agile model-driven approach}, journal = {Process Design for Natural Scientists: an agile model-driven approach}, number = {500}, editor = {Lambrecht, Anna-Lena and Margaria, Tiziana}, publisher = {Springer Verlag}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {65 -- 72}, year = {2014}, abstract = {The protein classification workflow described in this report enables users to get information about a novel protein sequence automatically. The information is derived by different bioinformatic analysis tools which calculate or predict features of a protein sequence. Also, databases are used to compare the novel sequence with known proteins.}, language = {en} } @article{Schulze2014, author = {Schulze, Gunnar}, title = {Workflow for rapid metagenome analysis}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {88 -- 100}, year = {2014}, abstract = {Analyses of metagenomes in life sciences present new opportunities as well as challenges to the scientific community and call for advanced computational methods and workflows. The large amount of data collected from samples via next-generation sequencing (NGS) technologies render manual approaches to sequence comparison and annotation unsuitable. Rather, fast and efficient computational pipelines are needed to provide comprehensive statistics and summaries and enable the researcher to choose appropriate tools for more specific analyses. The workflow presented here builds upon previous pipelines designed for automated clustering and annotation of raw sequence reads obtained from next-generation sequencing technologies such as 454 and Illumina. Employing specialized algorithms, the sequence reads are processed at three different levels. First, raw reads are clustered at high similarity cutoff to yield clusters which can be exported as multifasta files for further analyses. Independently, open reading frames (ORFs) are predicted from raw reads and clustered at two strictness levels to yield sets of non-redundant sequences and ORF families. Furthermore, single ORFs are annotated by performing searches against the Pfam database}, language = {en} } @article{Vierheller2014, author = {Vierheller, Janine}, title = {Exploratory Data Analysis}, series = {Process Design for Natural Scientists: an agile model-driven approach}, journal = {Process Design for Natural Scientists: an agile model-driven approach}, number = {500}, editor = {Lambrecht, Anna-Lena and Margaria, Tiziana}, publisher = {Axel Springer Verlag}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {110 -- 126}, year = {2014}, abstract = {In bioinformatics the term exploratory data analysis refers to different methods to get an overview of large biological data sets. Hence, it helps to create a framework for further analysis and hypothesis testing. The workflow facilitates this first important step of the data analysis created by high-throughput technologies. The results are different plots showing the structure of the measurements. The goal of the workflow is the automatization of the exploratory data analysis, but also the flexibility should be guaranteed. The basic tool is the free software R.}, language = {en} } @article{Schuett2014, author = {Sch{\"u}tt, Christine}, title = {Identification of differentially expressed genes}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {127 -- 139}, year = {2014}, abstract = {With the jABC it is possible to realize workflows for numerous questions in different fields. The goal of this project was to create a workflow for the identification of differentially expressed genes. This is of special interest in biology, for it gives the opportunity to get a better insight in cellular changes due to exogenous stress, diseases and so on. With the knowledge that can be derived from the differentially expressed genes in diseased tissues, it becomes possible to find new targets for treatment.}, language = {en} } @article{Kuntzsch2014, author = {Kuntzsch, Christian}, title = {Visualization of data transfer paths}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {140 -- 148}, year = {2014}, abstract = {A workflow for visualizing server connections using the Google Maps API was built in the jABC. It makes use of three basic services: An XML-based IP address geolocation web service, a command line tool and the Static Maps API. The result of the workflow is an URL leading to an image file of a map, showing server connections between a client and a target host.}, language = {en} } @article{Hibbe2014, author = {Hibbe, Marcel}, title = {Spotlocator - Guess Where the Photo Was Taken!}, series = {Process Design for Natural Scientists: an agile model-driven approach}, journal = {Process Design for Natural Scientists: an agile model-driven approach}, number = {500}, editor = {Lambrecht, Anna-Lena and Margaria, Tiziana}, publisher = {Springer Verlag}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {149 -- 160}, year = {2014}, abstract = {Spotlocator is a game wherein people have to guess the spots of where photos were taken. The photos of a defined area for each game are from panoramio.com. They are published at http://spotlocator. drupalgardens.com with an ID. Everyone can guess the photo spots by sending a special tweet via Twitter that contains the hashtag \#spotlocator, the guessed coordinates and the ID of the photo. An evaluation is published for all tweets. The players are informed about the distance to the real photo spots and the positions are shown on a map.}, language = {en} } @article{Blaese2014, author = {Blaese, Leif}, title = {Data mining for unidentified protein squences}, series = {Process design for natural scientists: an agile model-driven approach}, journal = {Process design for natural scientists: an agile model-driven approach}, number = {500}, publisher = {Springer}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {73 -- 87}, year = {2014}, abstract = {Through the use of next generation sequencing (NGS) technology, a lot of newly sequenced organisms are now available. Annotating those genes is one of the most challenging tasks in sequence biology. Here, we present an automated workflow to find homologue proteins, annotate sequences according to function and create a three-dimensional model.}, language = {en} } @article{Lis2014, author = {Lis, Monika}, title = {Constructing a Phylogenetic Tree}, series = {Process Design for Natural Scientists: an agile model-driven approach}, journal = {Process Design for Natural Scientists: an agile model-driven approach}, number = {500}, editor = {Lambrecht, Anna-Lena and Margaria, Tiziana}, publisher = {Springer Verlag}, address = {Berlin}, isbn = {978-3-662-45005-5}, issn = {1865-0929}, pages = {101 -- 109}, year = {2014}, abstract = {In this project I constructed a workflow that takes a DNA sequence as input and provides a phylogenetic tree, consisting of the input sequence and other sequences which were found during a database search. In this phylogenetic tree the sequences are arranged depending on similarities. In bioinformatics, constructing phylogenetic trees is often used to explore the evolutionary relationships of genes or organisms and to understand the mechanisms of evolution itself.}, language = {en} } @phdthesis{Heise2014, author = {Heise, Arvid}, title = {Data cleansing and integration operators for a parallel data analytics platform}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-77100}, school = {Universit{\"a}t Potsdam}, pages = {ii, 179}, year = {2014}, abstract = {The data quality of real-world datasets need to be constantly monitored and maintained to allow organizations and individuals to reliably use their data. Especially, data integration projects suffer from poor initial data quality and as a consequence consume more effort and money. Commercial products and research prototypes for data cleansing and integration help users to improve the quality of individual and combined datasets. They can be divided into either standalone systems or database management system (DBMS) extensions. On the one hand, standalone systems do not interact well with DBMS and require time-consuming data imports and exports. On the other hand, DBMS extensions are often limited by the underlying system and do not cover the full set of data cleansing and integration tasks. We overcome both limitations by implementing a concise set of five data cleansing and integration operators on the parallel data analytics platform Stratosphere. We define the semantics of the operators, present their parallel implementation, and devise optimization techniques for individual operators and combinations thereof. Users specify declarative queries in our query language METEOR with our new operators to improve the data quality of individual datasets or integrate them to larger datasets. By integrating the data cleansing operators into the higher level language layer of Stratosphere, users can easily combine cleansing operators with operators from other domains, such as information extraction, to complex data flows. Through a generic description of the operators, the Stratosphere optimizer reorders operators even from different domains to find better query plans. As a case study, we reimplemented a part of the large Open Government Data integration project GovWILD with our new operators and show that our queries run significantly faster than the original GovWILD queries, which rely on relational operators. Evaluation reveals that our operators exhibit good scalability on up to 100 cores, so that even larger inputs can be efficiently processed by scaling out to more machines. Finally, our scripts are considerably shorter than the original GovWILD scripts, which results in better maintainability of the scripts.}, language = {en} } @misc{Voland2014, type = {Master Thesis}, author = {Voland, Patrick}, title = {Webbasierte Visualisierung von Extended Floating Car Data (XFCD)}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-96751}, school = {Universit{\"a}t Potsdam}, pages = {VIII, 176}, year = {2014}, abstract = {Moderne Kraftfahrzeuge verf{\"u}gen {\"u}ber eine Vielzahl an Sensoren, welche f{\"u}r einen reibungslosen technischen Betrieb ben{\"o}tigt werden. Hierzu z{\"a}hlen neben fahrzeugspezifischen Sensoren (wie z.B. Motordrehzahl und Fahrzeuggeschwindigkeit) auch umweltspezifische Sensoren (wie z.B. Luftdruck und Umgebungstemperatur). Durch die zunehmende technische Vernetzung wird es m{\"o}glich, diese Daten der Kraftfahrzeugelektronik aus dem Fahrzeug heraus f{\"u}r die verschiedensten Zwecke zu verwenden. Die vorliegende Arbeit soll einen Beitrag dazu leisten, diese neue Art an massenhaften Daten im Sinne des Konzepts der „Extended Floating Car Data" (XFCD) als Geoinformationen nutzbar zu machen und diese f{\"u}r raumzeitliche Visualisierungen (zur visuellen Analyse) anwenden zu k{\"o}nnen. In diesem Zusammenhang wird speziell die Perspektive des Umwelt- und Verkehrsmonitoring betrachtet, wobei die Anforderungen und Potentiale mit Hilfe von Experteninterviews untersucht werden. Es stellt sich die Frage, welche Daten durch die Kraftfahrzeugelektronik geliefert und wie diese m{\"o}glichst automatisiert erfasst, verarbeitet, visualisiert und {\"o}ffentlich bereitgestellt werden k{\"o}nnen. Neben theoretischen und technischen Grundlagen zur Datenerfassung und -nutzung liegt der Fokus auf den Methoden der kartographischen Visualisierung. Dabei soll der Frage nachgegangenen werden, ob eine technische Implementierung ausschließlich unter Verwendung von Open Source Software m{\"o}glich ist. Das Ziel der Arbeit bildet ein zweigliedriger Ansatz, welcher zum einen die Visualisierung f{\"u}r ein exemplarisch gew{\"a}hltes Anwendungsszenario und zum anderen die prototypische Implementierung von der Datenerfassung im Fahrzeug unter Verwendung der gesetzlich vorgeschriebenen „On Board Diagnose"-Schnittstelle und einem Smartphone-gest{\"u}tzten Ablauf bis zur webbasierten Visualisierung umfasst.}, language = {de} } @phdthesis{Lorey2014, author = {Lorey, Johannes}, title = {What's in a query : analyzing, predicting, and managing linked data access}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-72312}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {The term Linked Data refers to connected information sources comprising structured data about a wide range of topics and for a multitude of applications. In recent years, the conceptional and technical foundations of Linked Data have been formalized and refined. To this end, well-known technologies have been established, such as the Resource Description Framework (RDF) as a Linked Data model or the SPARQL Protocol and RDF Query Language (SPARQL) for retrieving this information. Whereas most research has been conducted in the area of generating and publishing Linked Data, this thesis presents novel approaches for improved management. In particular, we illustrate new methods for analyzing and processing SPARQL queries. Here, we present two algorithms suitable for identifying structural relationships between these queries. Both algorithms are applied to a large number of real-world requests to evaluate the performance of the approaches and the quality of their results. Based on this, we introduce different strategies enabling optimized access of Linked Data sources. We demonstrate how the presented approach facilitates effective utilization of SPARQL endpoints by prefetching results relevant for multiple subsequent requests. Furthermore, we contribute a set of metrics for determining technical characteristics of such knowledge bases. To this end, we devise practical heuristics and validate them through thorough analysis of real-world data sources. We discuss the findings and evaluate their impact on utilizing the endpoints. Moreover, we detail the adoption of a scalable infrastructure for improving Linked Data discovery and consumption. As we outline in an exemplary use case, this platform is eligible both for processing and provisioning the corresponding information.}, language = {en} } @phdthesis{Steinert2014, author = {Steinert, Bastian}, title = {Built-in recovery support for explorative programming}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71305}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {This work introduces concepts and corresponding tool support to enable a complementary approach in dealing with recovery. Programmers need to recover a development state, or a part thereof, when previously made changes reveal undesired implications. However, when the need arises suddenly and unexpectedly, recovery often involves expensive and tedious work. To avoid tedious work, literature recommends keeping away from unexpected recovery demands by following a structured and disciplined approach, which consists of the application of various best practices including working only on one thing at a time, performing small steps, as well as making proper use of versioning and testing tools. However, the attempt to avoid unexpected recovery is both time-consuming and error-prone. On the one hand, it requires disproportionate effort to minimize the risk of unexpected situations. On the other hand, applying recommended practices selectively, which saves time, can hardly avoid recovery. In addition, the constant need for foresight and self-control has unfavorable implications. It is exhaustive and impedes creative problem solving. This work proposes to make recovery fast and easy and introduces corresponding support called CoExist. Such dedicated support turns situations of unanticipated recovery from tedious experiences into pleasant ones. It makes recovery fast and easy to accomplish, even if explicit commits are unavailable or tests have been ignored for some time. When mistakes and unexpected insights are no longer associated with tedious corrective actions, programmers are encouraged to change source code as a means to reason about it, as opposed to making changes only after structuring and evaluating them mentally. This work further reports on an implementation of the proposed tool support in the Squeak/Smalltalk development environment. The development of the tools has been accompanied by regular performance and usability tests. In addition, this work investigates whether the proposed tools affect programmers' performance. In a controlled lab study, 22 participants improved the design of two different applications. Using a repeated measurement setup, the study examined the effect of providing CoExist on programming performance. The result of analyzing 88 hours of programming suggests that built-in recovery support as provided with CoExist positively has a positive effect on programming performance in explorative programming tasks.}, language = {en} }