% Bibliography records for 2014 publications (theses, book chapters, reports).
% Abstracts are stored verbatim from the repository export apart from markup repairs.

% Shared container data for the four chapter entries of the edited volume
% with ISBN 978-3-662-45005-5 (Blaese2014, Vierheller2014, Teske2014, Holler2014).
% NOTE(review): series name inferred from ISSN 1865-0929 -- confirm against the publisher record.
@string{pdnsbook   = {Process Design for Natural Scientists: An Agile Model-Driven Approach}}
@string{ccisseries = {Communications in Computer and Information Science}}

@phdthesis{Heise2014,
  author    = {Heise, Arvid},
  title     = {Data cleansing and integration operators for a parallel data analytics platform},
  school    = {Universit{\"a}t Potsdam},
  year      = {2014},
  pagetotal = {ii, 179},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-77100},
  language  = {en},
  abstract  = {The data quality of real-world datasets need to be constantly monitored and maintained to allow organizations and individuals to reliably use their data. Especially, data integration projects suffer from poor initial data quality and as a consequence consume more effort and money. Commercial products and research prototypes for data cleansing and integration help users to improve the quality of individual and combined datasets. They can be divided into either standalone systems or database management system (DBMS) extensions. On the one hand, standalone systems do not interact well with DBMS and require time-consuming data imports and exports. On the other hand, DBMS extensions are often limited by the underlying system and do not cover the full set of data cleansing and integration tasks. We overcome both limitations by implementing a concise set of five data cleansing and integration operators on the parallel data analytics platform Stratosphere. We define the semantics of the operators, present their parallel implementation, and devise optimization techniques for individual operators and combinations thereof. Users specify declarative queries in our query language METEOR with our new operators to improve the data quality of individual datasets or integrate them to larger datasets. By integrating the data cleansing operators into the higher level language layer of Stratosphere, users can easily combine cleansing operators with operators from other domains, such as information extraction, to complex data flows. Through a generic description of the operators, the Stratosphere optimizer reorders operators even from different domains to find better query plans. As a case study, we reimplemented a part of the large Open Government Data integration project GovWILD with our new operators and show that our queries run significantly faster than the original GovWILD queries, which rely on relational operators. Evaluation reveals that our operators exhibit good scalability on up to 100 cores, so that even larger inputs can be efficiently processed by scaling out to more machines. Finally, our scripts are considerably shorter than the original GovWILD scripts, which results in better maintainability of the scripts.},
}

% Chapter in the edited volume described by the pdnsbook macro.
@incollection{Blaese2014,
  author    = {Blaese, Leif},
  title     = {Data mining for unidentified protein sequences},
  booktitle = pdnsbook,
  editor    = {Lamprecht, Anna-Lena and Margaria, Tiziana},
  series    = ccisseries,
  volume    = {500},
  publisher = {Springer},
  address   = {Berlin},
  pages     = {73--87},
  year      = {2014},
  isbn      = {978-3-662-45005-5},
  issn      = {1865-0929},
  language  = {en},
  abstract  = {Through the use of next generation sequencing (NGS) technology, a lot of newly sequenced organisms are now available. Annotating those genes is one of the most challenging tasks in sequence biology. Here, we present an automated workflow to find homologue proteins, annotate sequences according to function and create a three-dimensional model.},
}

% Postprint issued as number 653 of a university postprint series.
@misc{HaegeleSchlagenhaufRappetal.2014,
  author   = {H{\"a}gele, Claudia and Schlagenhauf, Florian and Rapp, Michael A. and Sterzer, Philipp and Beck, Anne and Bermpohl, Felix and Stoy, Meline and Str{\"o}hle, Andreas and Wittchen, Hans-Ulrich and Dolan, Raymond J. and Heinz, Andreas},
  title    = {Dimensional psychiatry},
  series   = {Postprints der Universit{\"a}t Potsdam : Humanwissenschaftliche Reihe},
  number   = {653},
  pages    = {331--341},
  year     = {2014},
  issn     = {1866-8364},
  doi      = {10.25932/publishup-43106},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-431064},
  language = {en},
  abstract = {A dimensional approach in psychiatry aims to identify core mechanisms of mental disorders across nosological boundaries. We compared anticipation of reward between major psychiatric disorders, and investigated whether reward anticipation is impaired in several mental disorders and whether there is a common psychopathological correlate (negative mood) of such an impairment. We used functional magnetic resonance imaging (fMRI) and a monetary incentive delay (MID) task to study the functional correlates of reward anticipation across major psychiatric disorders in 184 subjects, with the diagnoses of alcohol dependence (n = 26), schizophrenia (n = 44), major depressive disorder (MDD, n = 24), bipolar disorder (acute manic episode, n = 13), attention deficit/hyperactivity disorder (ADHD, n = 23), and healthy controls (n = 54). Subjects' individual Beck Depression Inventory- and State-Trait Anxiety Inventory-scores were correlated with clusters showing significant activation during reward anticipation. During reward anticipation, we observed significant group differences in ventral striatal (VS) activation: patients with schizophrenia, alcohol dependence, and major depression showed significantly less ventral striatal activation compared to healthy controls. Depressive symptoms correlated with dysfunction in reward anticipation regardless of diagnostic entity. There was no significant correlation between anxiety symptoms and VS functional activation. Our findings demonstrate a neurobiological dysfunction related to reward prediction that transcended disorder categories and was related to measures of depressed mood. The findings underline the potential of a dimensional approach in psychiatry and strengthen the hypothesis that neurobiological research in psychiatric disorders can be targeted at core mechanisms that are likely to be implicated in a range of clinical entities.},
}

% Published technical report (number 90); the abstract itself calls this work a technical report.
% NOTE(review): institution is derived from the abstract (HPI) and the publisher location -- confirm.
@techreport{GrapentinHeidlerKorschetal.2014,
  author      = {Grapentin, Andreas and Heidler, Kirstin and Korsch, Dimitri and Kumar Sah, Rakesh and Kunzmann, Nicco and Henning, Johannes and Mattis, Toni and Rein, Patrick and Seckler, Eric and Groneberg, Bj{\"o}rn and Zimmermann, Florian},
  title       = {Embedded operating system projects},
  editor      = {Hentschel, Uwe and Richter, Daniel and Polze, Andreas},
  institution = {Hasso-Plattner-Institut, Universit{\"a}t Potsdam},
  number      = {90},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  year        = {2014},
  pagetotal   = {xi, 87},
  isbn        = {978-3-86956-296-4},
  issn        = {1613-5652},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-69154},
  language    = {en},
  abstract    = {In today's life, embedded systems are ubiquitous. But they differ from traditional desktop systems in many aspects - these include predictable timing behavior (real-time), the management of scarce resources (memory, network), reliable communication protocols, energy management, special purpose user-interfaces (headless operation), system configuration, programming languages (to support software/hardware co-design), and modeling techniques. Within this technical report, authors present results from the lecture ``Operating Systems for Embedded Computing'' that has been offered by the ``Operating Systems and Middleware'' group at HPI in Winter term 2013/14. Focus of the lecture and accompanying projects was on principles of real-time computing. Students had the chance to gather practical experience with a number of different OSes and applications and present experiences with near-hardware programming. Projects address the entire spectrum, from bare-metal programming to harnessing a real-time OS to exercising the full software/hardware co-design cycle. Three outstanding projects are at the heart of this technical report. Project 1 focuses on the development of a bare-metal operating system for LEGO Mindstorms EV3. While still a toy, it comes with a powerful ARM processor, 64 MB of main memory, standard interfaces, such as Bluetooth and network protocol stacks. EV3 runs a version of Linux. Sources are available from Lego's web site. However, many devices and their driver software are proprietary and not well documented. Developing a new, bare-metal OS for the EV3 requires an understanding of the EV3 boot process. Since no standard input/output devices are available, initial debugging steps are tedious. After managing these initial steps, the project was able to adapt device drivers for a few Lego devices to an extent that a demonstrator (the Segway application) could be successfully run on the new OS. Project 2 looks at the EV3 from a different angle. The EV3 is running a pretty decent version of Linux- in principle, the RT\_PREEMPT patch can turn any Linux system into a real-time OS by modifying the behavior of a number of synchronization constructs at the heart of the OS. Priority inversion is a problem that is solved by protocols such as priority inheritance or priority ceiling. Real-time OSes implement at least one of the protocols. The central idea of the project was the comparison of non-real-time and real-time variants of Linux on the EV3 hardware. A task set that showed effects of priority inversion on standard EV3 Linux would operate flawlessly on the Linux version with the RT\_PREEMPT-patch applied. If only patching Lego's version of Linux was that easy... Project 3 takes the notion of real-time computing more seriously. The application scenario was centered around our Carrera Digital 132 racetrack. Obtaining position information from the track, controlling individual cars, detecting and modifying the Carrera Digital protocol required design and implementation of custom controller hardware. What to implement in hardware, firmware, and what to implement in application software - this was the central question addressed by the project.},
}

% The stored abstract of this record is the German-language summary.
@phdthesis{Takouna2014,
  author   = {Takouna, Ibrahim},
  title    = {Energy-efficient and performance-aware virtual machine management for cloud data centers},
  school   = {Universit{\"a}t Potsdam},
  year     = {2014},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-72399},
  language = {en},
  abstract = {Virtualisierte Cloud Datenzentren stellen nach Bedarf Ressourcen zur Verf{\"u}gung, erm{\"o}glichen agile Ressourcenbereitstellung und beherbergen heterogene Applikationen mit verschiedenen Anforderungen an Ressourcen. Solche Datenzentren verbrauchen enorme Mengen an Energie, was die Erh{\"o}hung der Betriebskosten, der W{\"a}rme innerhalb der Zentren und des Kohlendioxidaussto{\ss}es verursacht. Der Anstieg des Energieverbrauches kann durch ein ineffektives Ressourcenmanagement, das die ineffiziente Ressourcenausnutzung verursacht, entstehen. Die vorliegende Dissertation stellt detaillierte Modelle und neue Verfahren f{\"u}r virtualisiertes Ressourcenmanagement in Cloud Datenzentren vor. Die vorgestellten Verfahren ziehen das Service-Level-Agreement (SLA) und die Heterogenit{\"a}t der Auslastung bez{\"u}glich des Bedarfs an Speicherzugriffen und Kommunikationsmustern von Web- und HPC- (High Performance Computing) Applikationen in Betracht. Um die pr{\"a}sentierten Techniken zu evaluieren, verwenden wir Simulationen und echte Protokollierung der Auslastungen von Web- und HPC- Applikationen. Au{\ss}erdem vergleichen wir unser Techniken und Verfahren mit anderen aktuellen Verfahren durch die Anwendung von verschiedenen Performance Metriken. Die Hauptbeitr{\"a}ge dieser Dissertation sind Folgendes: Ein Proaktives auf robuster Optimierung basierendes Ressourcenbereitstellungsverfahren. Dieses Verfahren erh{\"o}ht die F{\"a}higkeit der Hostes zur Verf{\"u}gungsstellung von mehr VMs. Gleichzeitig aber wird der unn{\"o}tige Energieverbrauch minimiert. Zus{\"a}tzlich mindert diese Technik unerw{\"u}nschte {\"A}nderungen im Energiezustand des Servers. Die vorgestellte Technik nutzt einen auf Intervall basierenden Vorhersagealgorithmus zur Implementierung einer robusten Optimierung. Dabei werden unsichere Anforderungen in Betracht gezogen. Ein adaptives und auf Intervall basierendes Verfahren zur Vorhersage des Arbeitsaufkommens mit hohen, in k{\"u}rzer Zeit auftretenden Schwankungen. Die Intervall basierende Vorhersage ist implementiert in der Standard Abweichung Variante und in der Median absoluter Abweichung Variante. Die Intervall-{\"A}nderungen basieren auf einem adaptiven Vertrauensfenster um die Schwankungen des Arbeitsaufkommens zu bew{\"a}ltigen. Eine robuste VM Zusammenlegung f{\"u}r ein effizientes Energie und Performance Management. Dies erm{\"o}glicht die gegenseitige Abh{\"a}ngigkeit zwischen der Energie und der Performance zu minimieren. Unser Verfahren reduziert die Anzahl der VM-Migrationen im Vergleich mit den neu vor kurzem vorgestellten Verfahren. Dies tr{\"a}gt auch zur Reduzierung des durch das Netzwerk verursachten Energieverbrauches. Au{\ss}erdem reduziert dieses Verfahren SLA-Verletzungen und die Anzahl von {\"A}nderungen an Energiezust{\"a}nden. Ein generisches Modell f{\"u}r das Netzwerk eines Datenzentrums um die verz{\"o}gerte Kommunikation und ihre Auswirkung auf die VM Performance und auf die Netzwerkenergie zu simulieren. Au{\ss}erdem wird ein generisches Modell f{\"u}r ein Memory-Bus des Servers vorgestellt. Dieses Modell beinhaltet auch Modelle f{\"u}r die Latenzzeit und den Energieverbrauch f{\"u}r verschiedene Memory Frequenzen. Dies erlaubt eine Simulation der Memory Verz{\"o}gerung und ihre Auswirkung auf die VM-Performance und auf den Memory Energieverbrauch. Kommunikation bewusste und Energie effiziente Zusammenlegung f{\"u}r parallele Applikationen um die dynamische Entdeckung von Kommunikationsmustern und das Umplanen von VMs zu erm{\"o}glichen. Das Umplanen von VMs benutzt eine auf den entdeckten Kommunikationsmustern basierende Migration. Eine neue Technik zur Entdeckung von dynamischen Mustern ist implementiert. Sie basiert auf der Signal Verarbeitung des Netzwerks von VMs, anstatt die Informationen des virtuellen Umstellung der Hosts oder der Initiierung der VMs zu nutzen. Das Ergebnis zeigt, dass unsere Methode die durchschnittliche Anwendung des Netzwerks reduziert und aufgrund der Reduzierung der aktiven Umstellungen Energie gespart. Au{\ss}erdem bietet sie eine bessere VM Performance im Vergleich zu der CPU-basierten Platzierung. Memory bewusste VM Zusammenlegung f{\"u}r unabh{\"a}ngige VMs. Sie nutzt die Vielfalt des VMs Memory Zuganges um die Anwendung vom Memory-Bus der Hosts zu balancieren. Die vorgestellte Technik, Memory-Bus Load Balancing (MLB), verteilt die VMs reaktiv neu im Bezug auf ihre Anwendung vom Memory-Bus. Sie nutzt die VM Migration um die Performance des gesamtem Systems zu verbessern. Au{\ss}erdem sind die dynamische Spannung, die Frequenz Skalierung des Memory und die MLB Methode kombiniert um ein besseres Energiesparen zu leisten.},
}

@phdthesis{Hebig2014,
  author   = {Hebig, Regina},
  title    = {Evolution of model-driven engineering settings in practice},
  school   = {Universit{\"a}t Potsdam},
  year     = {2014},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-70761},
  language = {en},
  abstract = {Nowadays, software systems are getting more and more complex. To tackle this challenge most diverse techniques, such as design patterns, service oriented architectures (SOA), software development processes, and model-driven engineering (MDE), are used to improve productivity, while time to market and quality of the products stay stable. Multiple of these techniques are used in parallel to profit from their benefits. While the use of sophisticated software development processes is standard, today, MDE is just adopted in practice. However, research has shown that the application of MDE is not always successful. It is not fully understood when advantages of MDE can be used and to what degree MDE can also be disadvantageous for productivity. Further, when combining different techniques that aim to affect the same factor (e.g. productivity) the question arises whether these techniques really complement each other or, in contrast, compensate their effects. Due to that, there is the concrete question how MDE and other techniques, such as software development process, are interrelated. Both aspects (advantages and disadvantages for productivity as well as the interrelation to other techniques) need to be understood to identify risks relating to the productivity impact of MDE. Before studying MDE's impact on productivity, it is necessary to investigate the range of validity that can be reached for the results. This includes two questions. First, there is the question whether MDE's impact on productivity is similar for all approaches of adopting MDE in practice. Second, there is the question whether MDE's impact on productivity for an approach of using MDE in practice remains stable over time. The answers for both questions are crucial for handling risks of MDE, but also for the design of future studies on MDE success. This thesis addresses these questions with the goal to support adoption of MDE in future. To enable a differentiated discussion about MDE, the term ``MDE setting'' is introduced. MDE setting refers to the applied technical setting, i.e. the employed manual and automated activities, artifacts, languages, and tools. An MDE setting's possible impact on productivity is studied with a focus on changeability and the interrelation to software development processes. This is done by introducing a taxonomy of changeability concerns that might be affected by an MDE setting. Further, three MDE traits are identified and it is studied for which manifestations of these MDE traits software development processes are impacted. To enable the assessment and evaluation of an MDE setting's impacts, the Software Manufacture Model language is introduced. This is a process modeling language that allows to reason about how relations between (modeling) artifacts (e.g. models or code files) change during application of manual or automated development activities. On that basis, risk analysis techniques are provided. These techniques allow identifying changeability risks and assessing the manifestations of the MDE traits (and with it an MDE setting's impact on software development processes). To address the range of validity, MDE settings from practice and their evolution histories were capture in context of this thesis. First, this data is used to show that MDE settings cover the whole spectrum concerning their impact on changeability or interrelation to software development processes. Neither it is seldom that MDE settings are neutral for processes nor is it seldom that MDE settings have impact on processes. Similarly, the impact on changeability differs relevantly. Second, a taxonomy of evolution of MDE settings is introduced. In that context it is discussed to what extent different types of changes on an MDE setting can influence this MDE setting's impact on changeability and the interrelation to processes. The category of structural evolution, which can change these characteristics of an MDE setting, is identified. The captured MDE settings from practice are used to show that structural evolution exists and is common. In addition, some examples of structural evolution steps are collected that actually led to a change in the characteristics of the respective MDE settings. Two implications are: First, the assessed diversity of MDE settings evaluates the need for the analysis techniques that shall be presented in this thesis. Second, evolution is one explanation for the diversity of MDE settings in practice. To summarize, this thesis studies the nature and evolution of MDE settings in practice. As a result support for the adoption of MDE settings is provided in form of techniques for the identification of risks relating to productivity impacts.},
}

% Chapter in the edited volume described by the pdnsbook macro.
% NOTE(review): editor surname spelled Lamprecht here -- confirm against the publisher record.
@incollection{Vierheller2014,
  author    = {Vierheller, Janine},
  title     = {Exploratory Data Analysis},
  booktitle = pdnsbook,
  editor    = {Lamprecht, Anna-Lena and Margaria, Tiziana},
  series    = ccisseries,
  volume    = {500},
  publisher = {Springer},
  address   = {Berlin},
  pages     = {110--126},
  year      = {2014},
  isbn      = {978-3-662-45005-5},
  issn      = {1865-0929},
  language  = {en},
  abstract  = {In bioinformatics the term exploratory data analysis refers to different methods to get an overview of large biological data sets. Hence, it helps to create a framework for further analysis and hypothesis testing. The workflow facilitates this first important step of the data analysis created by high-throughput technologies. The results are different plots showing the structure of the measurements. The goal of the workflow is the automatization of the exploratory data analysis, but also the flexibility should be guaranteed. The basic tool is the free software R.},
}

% Chapter in the edited volume described by the pdnsbook macro.
@incollection{Teske2014,
  author    = {Teske, Daniel},
  title     = {Geocoder accuracy ranking},
  booktitle = pdnsbook,
  editor    = {Lamprecht, Anna-Lena and Margaria, Tiziana},
  series    = ccisseries,
  volume    = {500},
  publisher = {Springer},
  address   = {Berlin},
  pages     = {161--174},
  year      = {2014},
  isbn      = {978-3-662-45005-5},
  issn      = {1865-0929},
  language  = {en},
  abstract  = {Finding an address on a map is sometimes tricky: the chosen map application may be unfamiliar with the enclosed region. There are several geocoders on the market, they have different databases and algorithms to compute the query. Consequently, the geocoding results differ in their quality. Fortunately the geocoders provide a rich set of metadata. The workflow described in this paper compares this metadata with the aim to find out which geocoder is offering the best-fitting coordinate for a given address.},
}

% Chapter in the edited volume described by the pdnsbook macro.
@incollection{Holler2014,
  author    = {Holler, Robin},
  title     = {{GraffDok} -- a graffiti documentation application},
  booktitle = pdnsbook,
  editor    = {Lamprecht, Anna-Lena and Margaria, Tiziana},
  series    = ccisseries,
  volume    = {500},
  publisher = {Springer},
  address   = {Berlin},
  pages     = {239--251},
  year      = {2014},
  isbn      = {978-3-662-45005-5},
  issn      = {1865-0929},
  language  = {en},
  abstract  = {GraffDok is an application helping to maintain an overview over sprayed images somewhere in a city. At the time of writing it aims at vandalism rather than at beautiful photographic graffiti in an underpass. Looking at hundreds of tags and scribbles on monuments, house walls, etc. it would be interesting to not only record them in writing but even make them accessible electronically, including images. GraffDok's workflow is simple and only requires an EXIF-GPS-tagged photograph of a graffito. It automatically determines its location by using reverse geocoding with the given GPS-coordinates and the Gisgraphy WebService. While asking the user for some more meta data, GraffDok analyses the image in parallel with this and tries to detect fore- and background - before extracting the drawing lines and make them stand alone. The command line based tool ImageMagick is used here as well as for accessing EXIF data. Any meta data is written to csv-files, which will stay easily accessible and can be integrated in TeX-files as well. The latter ones are converted to PDF at the end of the workflow, containing a table about all graffiti and a summary for each - including the generated characteristic graffiti pattern image.},
}

% Edited report volume without personal authors; the stored abstract is in German.
@book{OPUS4-8627,
  title     = {{HPI} Future {SOC} Lab},
  editor    = {Meinel, Christoph and Polze, Andreas and Oswald, Gerhard and Strotmann, Rolf and Seibold, Ulrich and Schulzki, Bernhard},
  publisher = {Universit{\"a}t Potsdam},
  year      = {2014},
  pagetotal = {vi, 250},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-86271},
  language  = {en},
  abstract  = {Das Future SOC Lab am HPI ist eine Kooperation des Hasso-Plattner-Instituts mit verschiedenen Industriepartnern. Seine Aufgabe ist die Erm{\"o}glichung und F{\"o}rderung des Austausches zwischen Forschungsgemeinschaft und Industrie. Am Lab wird interessierten Wissenschaftlern eine Infrastruktur von neuester Hard- und Software kostenfrei f{\"u}r Forschungszwecke zur Verf{\"u}gung gestellt. Dazu z{\"a}hlen teilweise noch nicht am Markt verf{\"u}gbare Technologien, die im normalen Hochschulbereich in der Regel nicht zu finanzieren w{\"a}ren, bspw. Server mit bis zu 64 Cores und 2 TB Hauptspeicher. Diese Angebote richten sich insbesondere an Wissenschaftler in den Gebieten Informatik und Wirtschaftsinformatik. Einige der Schwerpunkte sind Cloud Computing, Parallelisierung und In-Memory Technologien. In diesem Technischen Bericht werden die Ergebnisse der Forschungsprojekte des Jahres 2014 vorgestellt. Ausgew{\"a}hlte Projekte stellten ihre Ergebnisse am 9. April 2014 und 29. Oktober 2014 im Rahmen der Future SOC Lab Tag Veranstaltungen vor.},
}