@phdthesis{Najafi2023, author = {Najafi, Pejman}, title = {Leveraging data science \& engineering for advanced security operations}, doi = {10.25932/publishup-61225}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-612257}, school = {Universit{\"a}t Potsdam}, pages = {xix, 180}, year = {2023}, abstract = {The Security Operations Center (SOC) represents a specialized unit responsible for managing security within enterprises. To aid in its responsibilities, the SOC relies heavily on a Security Information and Event Management (SIEM) system that functions as a centralized repository for all security-related data, providing a comprehensive view of the organization's security posture. Due to their ability to offer such insights, SIEMs are considered indispensable tools facilitating SOC functions, such as monitoring, threat detection, and incident response. Despite advancements in big data architectures and analytics, most SIEMs fall short of keeping pace. Architecturally, they function merely as log search engines, lacking support for distributed large-scale analytics. Analytically, they rely on rule-based correlation, neglecting the adoption of more advanced data science and machine learning techniques. This thesis first proposes a blueprint for next-generation SIEM systems that emphasize distributed processing and multi-layered storage to enable data mining at a big data scale. Next, with the architectural support, it introduces two data mining approaches for advanced threat detection as part of SOC operations. The first is a novel graph mining technique that formulates threat detection within the SIEM system as a large-scale graph mining and inference problem, built on the principles of guilt-by-association and exempt-by-reputation. The approach entails the construction of a Heterogeneous Information Network (HIN) that models shared characteristics and associations among entities extracted from SIEM-related events/logs. Thereon, a novel graph-based inference algorithm is used to infer a node's maliciousness score based on its associations with other entities in the HIN. The second is an innovative outlier detection technique that imitates a SOC analyst's reasoning process to find anomalies/outliers. The approach emphasizes explainability and simplicity, achieved by combining the output of simple context-aware univariate submodels that calculate an outlier score for each entry. Both approaches were tested in academic and real-world settings, demonstrating high performance when compared to other algorithms as well as practicality alongside a large enterprise's SIEM system. This thesis establishes the foundation for next-generation SIEM systems that can enhance today's SOCs and facilitate the transition from human-centric to data-driven security operations.}, language = {en} } @article{Perscheid2021, author = {Perscheid, Cindy}, title = {Integrative biomarker detection on high-dimensional gene expression data sets}, series = {Briefings in bioinformatics}, volume = {22}, journal = {Briefings in bioinformatics}, number = {3}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {1467-5463}, doi = {10.1093/bib/bbaa151}, pages = {18}, year = {2021}, abstract = {Gene expression data provide the expression levels of tens of thousands of genes from several hundred samples. These data are analyzed to detect biomarkers that can be of prognostic or diagnostic use. Traditionally, biomarker detection for gene expression data is the task of gene selection.
The vast number of genes is reduced to a few relevant ones that achieve the best performance for the respective use case. Traditional approaches select genes based on their statistical significance in the data set. This results in issues of robustness, redundancy, and true biological relevance of the selected genes. Integrative analyses typically address these shortcomings by integrating multiple data artifacts from the same objects, e.g. gene expression and methylation data. When only gene expression data are available, integrative analyses instead use curated information on biological processes from public knowledge bases. With knowledge bases providing an ever-increasing amount of curated biological knowledge, such prior knowledge approaches become more powerful. This paper provides a thorough overview of the status quo of biomarker detection on gene expression data with prior biological knowledge. We discuss current shortcomings of traditional approaches, review recent external knowledge bases, provide a classification and qualitative comparison of existing prior knowledge approaches, and discuss open challenges for this kind of gene selection.}, language = {en} } @article{Perscheid2021a, author = {Perscheid, Cindy}, title = {Comprior}, series = {BMC Bioinformatics}, volume = {22}, journal = {BMC Bioinformatics}, publisher = {Springer Nature}, address = {London}, issn = {1471-2105}, doi = {10.1186/s12859-021-04308-z}, pages = {1 -- 15}, year = {2021}, abstract = {Background Reproducible benchmarking is important for assessing the effectiveness of novel feature selection approaches applied on gene expression data, especially for prior knowledge approaches that incorporate biological information from online knowledge bases. However, no full-fledged benchmarking system exists that is extensible, provides built-in feature selection approaches, and offers a comprehensive result assessment encompassing classification performance, robustness, and biological relevance. Moreover, the particular needs of prior knowledge feature selection approaches, i.e. uniform access to knowledge bases, are not addressed. As a consequence, prior knowledge approaches are not evaluated against each other, leaving open questions regarding their effectiveness. Results We present the Comprior benchmark tool, which facilitates the rapid development and effortless benchmarking of feature selection approaches, with a special focus on prior knowledge approaches. Comprior is extensible by custom approaches, offers built-in standard feature selection approaches, enables uniform access to multiple knowledge bases, and provides a customizable evaluation infrastructure to compare multiple feature selection approaches regarding their classification performance, robustness, runtime, and biological relevance.
Conclusion Comprior allows reproducible benchmarking, especially of prior knowledge approaches, which facilitates their applicability and for the first time enables a comprehensive assessment of their effectiveness.}, language = {en} } @article{LosterKoumarelasNaumann2021, author = {Loster, Michael and Koumarelas, Ioannis and Naumann, Felix}, title = {Knowledge transfer for entity resolution with siamese neural networks}, series = {ACM journal of data and information quality}, volume = {13}, journal = {ACM journal of data and information quality}, number = {1}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3410157}, pages = {25}, year = {2021}, abstract = {The integration of multiple data sources is a common problem in a large variety of applications. Traditionally, handcrafted similarity measures are used to discover, merge, and integrate multiple representations of the same entity (duplicates) into a large homogeneous collection of data. Often, these similarity measures do not cope well with the heterogeneity of the underlying dataset. In addition, domain experts are needed to manually design and configure such measures, which is time-consuming and requires extensive domain expertise.
We propose a deep Siamese neural network, capable of learning a similarity measure that is tailored to the characteristics of a particular dataset. Owing to the properties of deep learning methods, we are able to eliminate the manual feature engineering process and thus considerably reduce the effort required for model construction. In addition, we show that it is possible to transfer knowledge acquired during the deduplication of one dataset to another, and thus significantly reduce the amount of data required to train a similarity measure. We evaluate our method on multiple datasets and compare our approach to state-of-the-art deduplication methods. Our approach outperforms competitors by up to +26 percent F-measure, depending on task and dataset. In addition, we show that knowledge transfer is not only feasible, but in our experiments led to an improvement in F-measure of up to +4.7 percent.}, language = {en} } @phdthesis{Baier2015, author = {Baier, Thomas}, title = {Matching events and activities}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-84548}, school = {Universit{\"a}t Potsdam}, pages = {xxii, 213}, year = {2015}, abstract = {Nowadays, business processes are increasingly supported by IT services that produce massive amounts of event data during process execution. Aiming at a better understanding and improvement of processes, this event data can be used to analyze processes using process mining techniques. Process models can be automatically discovered and the execution can be checked for conformance to specified behavior. Moreover, existing process models can be enhanced and annotated with valuable information, for example for performance analysis. While the maturity of process mining algorithms is increasing and more tools are entering the market, process mining projects still face the problem of different levels of abstraction when comparing events with modeled business activities. Mapping the recorded events to activities of a given process model is essential for conformance checking, annotation, and the understanding of process discovery results. Current approaches try to abstract from events in an automated way that does not capture the required domain knowledge to fit business activities. Such techniques can be a good way to quickly reduce complexity in process discovery. Yet, they fail to enable techniques like conformance checking or model annotation, and potentially create misleading process discovery results by not using the known business terminology. In this thesis, we develop approaches that abstract an event log to the same level that is needed by the business. Typically, this abstraction level is defined by a given process model. Thus, the goal of this thesis is to match events from an event log to activities in a given process model. To accomplish this goal, behavioral and linguistic aspects of process models and event logs as well as domain knowledge captured in existing process documentation are taken into account to build semi-automatic matching approaches. The approaches establish a preprocessing step for every available process mining technique that produces or annotates a process model, thereby reducing the manual effort for process analysts. While each of the presented approaches can be used in isolation, we also introduce a general framework for the integration of different matching approaches. The approaches have been evaluated in case studies with industry and using a large industry process model collection and simulated event logs.
The evaluation demonstrates the effectiveness and efficiency of the approaches and their robustness towards nonconforming execution logs.}, language = {en} } @book{HerbstMaschlerNiephausetal.2015, author = {Herbst, Eva-Maria and Maschler, Fabian and Niephaus, Fabio and Reimann, Max and Steier, Julia and Felgentreff, Tim and Lincke, Jens and Taeumel, Marcel and Hirschfeld, Robert and Witt, Carsten}, title = {ecoControl}, number = {93}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-318-3}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-72147}, pages = {viii, 142}, year = {2015}, abstract = {A decentralized energy supply is a first step towards the energy transition. In this context, various electricity and heat generators are increasingly being used in apartment buildings as well. In Germany in particular, combined heat and power units are deployed more and more often, because they can convert gas into electricity and heat very efficiently. In combination with other energy systems, such as photovoltaic installations, they also enable a continuous and decentralized energy supply. When operating different energy systems, it is desirable that the systems work in coordination with each other. So far, however, it has been difficult to operate heterogeneous energy systems efficiently together, leaving potential savings unrealized. A central control unit can therefore improve the efficiency of the overall system. With ecoControl, we present an extensible prototype that optimizes the cooperation of energy systems and takes environmental factors into account. To this end, the software provides a uniform user interface for configuring all systems. It also offers the possibility to develop, test, and execute optimization algorithms via a programming interface. Within such algorithms, predictions provided by ecoControl can be used. These predictions are based on the individual behavior of each energy system, on weather forecasts, and on prognoses of energy consumption. Using a simulation, technicians can immediately try out different configurations and optimizations without having to test them on real devices over a long period of time. Beyond that, ecoControl also supports property managers and landlords in managing and analyzing energy costs. Based on case studies, we have shown that optimization algorithms which improve the use of heat storage can considerably increase the efficiency of the overall system. Finally, we conclude that, as a next step, ecoControl must be tested under real conditions as soon as a suitable hardware component is available.
{\"U}ber diese Schnittstelle werden die Messwerte an ecoControl gesendet und Steuersignale an die Ger{\"a}te weitergeleitet.}, language = {de} } @article{DittmarBuchholzKuehn2016, author = {Dittmar, Anke and Buchholz, Gregor and K{\"u}hn, Mathias}, title = {Eine Studie zum kollaborativen Modellieren in der Softwaretechnik-Ausbildung}, series = {Commentarii informaticae didacticae (CID)}, journal = {Commentarii informaticae didacticae (CID)}, number = {10}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-376-3}, issn = {1868-0844}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-94806}, pages = {41 -- 53}, year = {2016}, abstract = {Die Vermittlung von Modellierungsf{\"a}higkeiten in der Softwaretechnik-Ausbildung konzentriert sich meist auf Modellierungskonzepte, Notationen und Entwicklungswerkzeuge. Die Betrachtung der Modellierungsaktivit{\"a}ten, etwa die Entwicklung und Gegen{\"u}berstellung alternativer Modellvorschl{\"a}ge, steht weniger im Vordergrund. Die vorliegende Studie untersucht zwei Formen des kollaborativen Modellierens am Tabletop in Bezug auf ihren Einfluss auf die Modellierungsaktivit{\"a}ten in kleinen Gruppen. Die Ergebnisse zeigen, dass sowohl selbstorganisierte als auch moderierte Modellierungssitzungen das Entwickeln eines gemeinsamen Modellverst{\"a}ndnisses f{\"o}rdern. In moderierten Sitzungen wurden zudem mehr alternative L{\"o}sungsideen entwickelt und in st{\"a}rkerem Maße diskutiert.}, language = {de} } @phdthesis{AlSaffar2016, author = {Al-Saffar, Loay Talib Ahmed}, title = {Analysing prerequisites, expectations, apprehensions, and attitudes of university students studying Computer science}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-98437}, school = {Universit{\"a}t Potsdam}, pages = {xii, 131}, year = {2016}, abstract = {The main objective of this dissertation is to analyse prerequisites, expectations, apprehensions, and attitudes of students studying computer science, who are willing to gain a bachelor degree. The research will also investigate in the students' learning style according to the Felder-Silverman model. These investigations fall in the attempt to make an impact on reducing the "dropout"/shrinkage rate among students, and to suggest a better learning environment. The first investigation starts with a survey that has been made at the computer science department at the University of Baghdad to investigate the attitudes of computer science students in an environment dominated by women, showing the differences in attitudes between male and female students in different study years. Students are accepted to university studies via a centrally controlled admission procedure depending mainly on their final score at school. This leads to a high percentage of students studying subjects they do not want. Our analysis shows that 75\% of the female students do not regret studying computer science although it was not their first choice. And according to statistics over previous years, women manage to succeed in their study and often graduate on top of their class. We finish with a comparison of attitudes between the freshman students of two different cultures and two different university enrolment procedures (University of Baghdad, in Iraq, and the University of Potsdam, in Germany) both with opposite gender majority. 
The second investigation took place at the department of computer science at the University of Potsdam in Germany and analysed the learning styles of students in the three major fields of study offered by the department (computer science, business informatics, and computer science teaching). Since students of these fields usually take some joint courses, investigating the differences in their learning styles is important for identifying which changes to the teaching methods are necessary to address these different students. It was a two-stage study using two questionnaires: the main one is based on the Index of Learning Styles Questionnaire of B. A. Solomon and R. M. Felder, and the second questionnaire investigated the students' attitudes towards the findings of their personal first questionnaire. Our analysis shows differences in learning style preferences between male and female students of the different study fields, as well as differences between students of the different specialties (computer science, business informatics, and computer science teaching). The third investigation looks closely into the difficulties, issues, apprehensions, and expectations of freshman students studying computer science. The study took place at the computer science department at the University of Potsdam with a volunteer sample of students. The goal is to determine and discuss the difficulties and issues they face in their studies that may lead them to consider dropping out, changing the field of study, or changing the university. The research continued with the same sample of students (with business informatics students being the majority) through more than three semesters. Difficulties and issues during the study were documented, as well as students' attitudes, apprehensions, and expectations. Some of the professors' and lecturers' opinions on and solutions to some students' problems were also documented. Many participants had apprehensions and difficulties, especially towards informatics subjects. Some business informatics participants began to think of changing the university, in particular when they reached their third semester; others thought about changing their field of study. Until the end of this research, most of the participants continued their studies (either the program they had started with or the new one they had changed to) without leaving the higher education system.}, language = {en} } @book{NeuhausPolzeChowdhuryy2011, author = {Neuhaus, Christian and Polze, Andreas and Chowdhuryy, Mohammad M. R.}, title = {Survey on healthcare IT systems : standards, regulations and security}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-128-8}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-51463}, pages = {53}, year = {2011}, abstract = {IT systems for healthcare are a complex and exciting field. On the one hand, there is a vast number of improvements and work alleviations that computers can bring to everyday healthcare. Some treatments, diagnoses, and organisational tasks were even made possible by computer usage in the first place. On the other hand, there are many factors that encumber computer usage and make the development of IT systems for healthcare a challenging, sometimes even frustrating task. These factors are not solely technology-related, but also include social and economic conditions.
This report describes some of the idiosyncrasies of IT systems in the healthcare domain, with a special focus on legal regulations, standards, and security.}, language = {en} } @book{AbedjanNaumann2011, author = {Abedjan, Ziawasch and Naumann, Felix}, title = {Advancing the discovery of unique column combinations}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-148-6}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564}, pages = {25}, year = {2011}, abstract = {Unique column combinations of a relational database table are sets of columns that contain only unique values. Discovering such combinations is a fundamental research problem and has many different data management and knowledge discovery applications. Existing discovery algorithms are either brute-force or have a high memory load and can thus be applied only to small datasets or samples. In this paper, the well-known GORDIAN algorithm and "Apriori-based" algorithms are compared and analyzed for further optimization. We greatly improve the Apriori algorithms through efficient candidate generation and statistics-based pruning methods. A hybrid solution, HCA-GORDIAN, combines the advantages of GORDIAN and our new algorithm HCA, and significantly outperforms all previous work in many situations.}, language = {en} } @article{SchlierkampThurner2015, author = {Schlierkamp, Kathrin and Thurner, Veronika}, title = {Was will ich eigentlich hier?}, series = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, volume = {2015}, journal = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, number = {9}, editor = {Schubert, Sigrid and Schwill, Andreas}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-84748}, pages = {179 -- 187}, year = {2015}, abstract = {Choosing the right subject of study and the subsequent introductory study phase are often decisive for the successful course of a degree program. One major challenge is to recognize, already within the first weeks of study, existing deficits in supposedly simple key competencies and to remedy them as soon as possible. A second, no less important challenge is to recognize as early as possible, for each individual student, whether he or she has chosen the individually right subject of study, one that matches his or her personal inclinations, interests, and abilities and contributes to achieving his or her own life goals. Only then are students sufficiently and permanently intrinsically motivated to successfully complete a demanding, complex degree program.
In this paper, we focus on a measure that introduces students to a process of systematically reflecting on their own learning process and their own goals, and that relates the two to each other.}, language = {de} } @article{VossebergCzernikErbetal.2015, author = {Vosseberg, Karin and Czernik, Sofie and Erb, Ulrike and Vielhaber, Michael}, title = {Projektorientierte Studieneingangsphase}, series = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, volume = {2015}, journal = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, number = {9}, editor = {Schubert, Sigrid and Schwill, Andreas}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-84730}, pages = {169 -- 177}, year = {2015}, abstract = {The goal of a new introductory study phase is to unfold for the students, by the end of their first semester, the diverse professional profile of computer science and business informatics with its broad spectrum of tasks, and thereby to clarify the connections between the individual modules of the curriculum. Students shall be enabled to take the planning and shaping of their studies into their own hands in a highly independent manner.}, language = {de} } @article{Broeker2015, author = {Br{\"o}ker, Kathrin}, title = {Unterst{\"u}tzung Informatik-Studierender durch ein Lernzentrum}, series = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, volume = {2015}, journal = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, number = {9}, editor = {Schubert, Sigrid and Schwill, Andreas}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-84754}, pages = {189 -- 197}, year = {2015}, abstract = {This paper presents the concept of a learning center for computer science (LZI) at the Universit{\"a}t Paderborn. Starting from the subject-specific difficulties of computer science students, the offerings of the LZI are described, which span four areas: individual counseling and mentoring, an "open learning space", workshops and courses, and research. A first evaluation by means of feedback forms shows that the offerings are received positively by the students. In the future, the offerings of the LZI shall be further expanded and improved, with further studies serving as the starting basis.}, language = {de} } @phdthesis{Prasse2016, author = {Prasse, Paul}, title = {Pattern recognition for computer security}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-100251}, school = {Universit{\"a}t Potsdam}, pages = {VI, 75}, year = {2016}, abstract = {Computer Security deals with the detection and mitigation of threats to computer networks, data, and computing hardware. This thesis addresses the following two computer security problems: email spam campaign detection and malware detection. Email spam campaigns can easily be generated using popular dissemination tools by specifying simple grammars that serve as message templates. A grammar is disseminated to the nodes of a botnet, and the nodes create messages by instantiating the grammar at random. Email spam campaigns can encompass huge data volumes and therefore pose a threat to the stability of the infrastructure of email service providers that have to store them. Malware (software that serves a malicious purpose) affects web servers, client computers via active content, and client computers through executable files. Without the help of malware detection systems, it would be easy for malware creators to collect sensitive information or to infiltrate computers.
The detection of threats, such as email spam messages, phishing messages, or malware, is an adversarial and therefore intrinsically difficult problem. Threats vary greatly and evolve over time. The detection of threats based on manually designed rules is therefore difficult and requires a constant engineering effort. Machine learning is a research area that revolves around the analysis of data and the discovery of patterns that describe aspects of the data. Discriminative learning methods extract prediction models from data that are optimized to predict a target attribute as accurately as possible. Machine learning methods hold the promise of automatically identifying patterns that robustly and accurately detect threats. This thesis focuses on the design and analysis of discriminative learning methods for the two computer-security problems under investigation: email campaign detection and malware detection. The first part of this thesis addresses email campaign detection. We focus on regular expressions as a syntactic framework, because regular expressions are intuitively comprehensible by security engineers and administrators, and they can be applied as a detection mechanism in an extremely efficient manner. In this setting, a prediction model is provided with exemplary messages from an email spam campaign. The prediction model has to generate a regular expression that reveals the syntactic pattern underlying the entire campaign, and that a security engineer finds comprehensible and trusts enough to use for blacklisting further messages at the email server. We model this problem as a two-stage learning problem with structured input and output spaces that can be solved using standard cutting-plane methods. To this end, we develop an appropriate loss function and derive a decoder for the resulting optimization problem. The second part of this thesis deals with the problem of predicting whether a given JavaScript or PHP file is malicious or benign. Recent malware analysis techniques use static or dynamic features, or both. In fully dynamic analysis, the software or script is executed and observed for malicious behavior in a sandbox environment. By contrast, static analysis is based on features that can be extracted directly from the program file. In order to bypass static detection mechanisms, code obfuscation techniques are used to spread a malicious program file in many different syntactic variants. Deobfuscating the code before applying a static classifier makes it amenable to static code analysis again and thus overcomes the problem of obfuscated malicious code, but on the other hand increases the computational costs of malware detection by an order of magnitude. In this thesis, we present a cascaded architecture in which a classifier first performs a static analysis of the original code and, based on the outcome of this first classification step, the code may be deobfuscated and classified again. We explore several types of features, including token \$n\$-grams, orthogonal sparse bigrams, subroutine hashings, and syntax-tree features, and study the robustness of detection methods and feature types against the evolution of malware over time. The developed tool scans very large file collections quickly and accurately. Each model is evaluated on real-world data and compared to reference methods.
Our approach of inferring regular expressions to filter emails belonging to an email spam campaign leads to models with a high true-positive rate at a very low false-positive rate, one that is an order of magnitude lower than that of a commercial content-based filter. Our system, REx-SVMshort, is being used by a commercial email service provider and complements content-based and IP-address-based filtering. Our cascaded malware detection system is evaluated on a high-quality data set of almost 400,000 conspicuous PHP files and a collection of more than 100,000 JavaScript files. From our case study, we conclude that our system can quickly and accurately process large data collections at a low false-positive rate.}, language = {en} } @phdthesis{Abedjan2014, author = {Abedjan, Ziawasch}, title = {Improving RDF data with data mining}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71334}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {Linked Open Data (LOD) comprises many, often large, public data sets and knowledge bases. Those datasets are mostly presented in the RDF triple structure of subject, predicate, and object, where each triple represents a statement or fact. Unfortunately, the heterogeneity of available open data requires significant integration steps before it can be used in applications. Meta information, such as ontological definitions and exact range definitions of predicates, is desirable and ideally provided by an ontology. However, in the context of LOD, ontologies are often incomplete or simply not available. Thus, it is useful to automatically generate meta information, such as ontological dependencies, range definitions, and topical classifications. Association rule mining, which was originally applied for sales analysis on transactional databases, is a promising and novel technique to explore such data. We designed an adaptation of this technique for mining RDF data and introduce the concept of "mining configurations", which allows us to mine RDF data sets in various ways. Different configurations enable us to identify schema and value dependencies that in combination result in interesting use cases. To this end, we present rule-based approaches for auto-completion, data enrichment, ontology improvement, and query relaxation. Auto-completion remedies the problem of inconsistent ontology usage, providing an editing user with a sorted list of commonly used predicates. A combination of different configurations extends this approach to create completely new facts for a knowledge base. We present two approaches for fact generation: a user-based approach, where a user selects the entity to be amended with new facts, and a data-driven approach, where an algorithm discovers entities that have to be amended with missing facts. As knowledge bases constantly grow and evolve, another approach to improve the usage of RDF data is to improve existing ontologies. Here, we present an association-rule-based approach to reconcile ontology and data. Interlacing different mining configurations, we derive an algorithm to discover synonymously used predicates. Those predicates can be used to expand query results and to support users during query formulation. We provide a wide range of experiments on real-world datasets for each use case.
The experiments and evaluations show the added value of association rule mining for the integration and usability of RDF data and confirm the appropriateness of our mining configuration methodology.}, language = {en} } @inproceedings{OPUS4-7665, title = {Proceedings of the Second HPI Cloud Symposium "Operating the Cloud" 2014}, number = {94}, editor = {Bosse, Sascha and Elsaid, Mohamed Esam and Feinbube, Frank and M{\"u}ller, Hendrik}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-319-0}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-76654}, pages = {vii, 59}, year = {2015}, abstract = {Every year, the Hasso Plattner Institute (HPI) invites guests from industry and academia to a collaborative scientific workshop on the topic "Operating the Cloud". Our goal is to provide a forum for the exchange of knowledge and experience between industry and academia. Hence, HPI's Future SOC Lab is the adequate environment to host this event, which is also supported by BITKOM. On the occasion of this workshop, we called for submissions of research papers and practitioners' reports. "Operating the Cloud" aims to be a platform for productive discussions of innovative ideas, visions, and upcoming technologies in the field of cloud operation and administration. These proceedings publish the results of the second HPI cloud symposium "Operating the Cloud" 2014. We thank the authors for exciting presentations and insights into their current work and research. Moreover, we look forward to more interesting submissions for the upcoming symposium in 2015.}, language = {en} } @phdthesis{Videla2014, author = {Videla, Santiago}, title = {Reasoning on the response of logical signaling networks with answer set programming}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71890}, school = {Universit{\"a}t Potsdam}, year = {2014}, abstract = {Deciphering the functioning of biological networks is one of the central tasks in systems biology. In particular, signal transduction networks are crucial for the understanding of the cellular response to external and internal perturbations. Importantly, in order to cope with the complexity of these networks, mathematical and computational modeling is required. We propose a computational modeling framework in order to achieve more robust discoveries in the context of logical signaling networks. More precisely, we focus on modeling the response of logical signaling networks by means of automated reasoning using Answer Set Programming (ASP). ASP provides a declarative language for modeling various knowledge representation and reasoning problems. Moreover, available ASP solvers provide several reasoning modes for assessing the multitude of answer sets. Therefore, leveraging its rich modeling language and its highly efficient solving capacities, we use ASP to address three challenging problems in the context of logical signaling networks: learning of (Boolean) logical networks, experimental design, and identification of intervention strategies. Overall, the contribution of this thesis is three-fold. Firstly, we introduce a mathematical framework for characterizing and reasoning on the response of logical signaling networks. Secondly, we contribute to a growing list of successful applications of ASP in systems biology.
Thirdly, we present a software tool providing a complete pipeline for automated reasoning on the response of logical signaling networks.}, language = {en} } @book{MeyerWeske2014, author = {Meyer, Andreas and Weske, Mathias}, title = {Weak conformance between process models and synchronized object life cycles}, number = {91}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-303-9}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-71722}, pages = {31}, year = {2014}, abstract = {Process models specify behavioral execution constraints between activities as well as between activities and data objects. A data object is characterized by its states and state transitions, represented as an object life cycle. For process execution, all behavioral execution constraints must be correct. Correctness can be verified via soundness checking, which currently considers only control flow information. For data correctness, conformance between a process model and its object life cycles is checked. Current approaches abstract from dependencies between multiple data objects and require fully specified process models, although real-world process repositories often contain underspecified models. To cope with these issues, we introduce the concept of synchronized object life cycles, and we define a mapping of the data constraints of a process model to Petri nets, extending an existing mapping. Further, we apply the notion of weak conformance to process models to tell whether, each time an activity needs to access a data object in a particular state, it is guaranteed that the data object is in, or can reach, the expected state. Then, we introduce an algorithm for an integrated verification of control flow correctness and weak data conformance using soundness checking.}, language = {en} } @article{WesselsMetzger2015, author = {Weßels, Doris and Metzger, Christiane}, title = {Die Arbeitswelt im Fokus}, series = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, volume = {2015}, journal = {HDI 2014 : Gestalten von {\"U}berg{\"a}ngen}, number = {9}, editor = {Schwill, Andreas and Schubert, Sigrid}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-80289}, pages = {77 -- 92}, year = {2015}, abstract = {For second-semester bachelor students of business informatics at Fachhochschule Kiel, so-called "active modules" (Aktivbausteine) are used in the Information Management module, alongside classical didactic approaches in a seminar-style form of teaching: on the one hand, students get the opportunity to form a concrete picture of the profession of a business informatics specialist through contact with professionals and executives from industry; on the other hand, they develop innovative approaches to process improvement from an IT perspective, or with potential benefit for IT, and present their results publicly at the Kieler Prozessmanagementforum. These active modules serve, in particular, the purpose of career orientation: the information that students receive about the demands and activities of working professionals enables them to make well-founded decisions regarding the organization of their studies and their choice of career. This paper presents the design of the modules and explains, based on current evaluation results, the degree to which their goals are achieved.
In addition, the motivational effect of the active modules is explained on the basis of the self-determination theory of Deci and Ryan [DR1985, DR1993, DR2004].}, language = {de} } @phdthesis{Haider2013, author = {Haider, Peter}, title = {Prediction with Mixture Models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-69617}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Learning a model for the relationship between the attributes and the annotated labels of data examples serves two purposes. Firstly, it enables the prediction of the label for examples without annotation. Secondly, the parameters of the model can provide useful insights into the structure of the data. If the data has an inherent partitioned structure, it is natural to mirror this structure in the model. Such mixture models predict by combining the individual predictions generated by the mixture components, which correspond to the partitions in the data. Often, the partitioned structure is latent and has to be inferred when learning the mixture model. Directly evaluating the accuracy of the inferred partition structure is, in many cases, impossible because the ground truth cannot be obtained for comparison. However, it can be assessed indirectly by measuring the prediction accuracy of the mixture model that arises from it. This thesis addresses the improvement of predictive accuracy by uncovering latent cluster structure in data, and the validation of the estimated structure by measuring the accuracy of the resulting predictive model. In the application of filtering unsolicited emails, the emails in the training set are latently clustered into advertisement campaigns. Uncovering this latent structure allows filtering of future emails with very low false-positive rates. In order to model the cluster structure, a Bayesian clustering model for dependent binary features is developed in this thesis. Knowing the clustering of emails into campaigns can also aid in uncovering which emails have been sent on behalf of the same network of captured hosts, so-called botnets. This association of emails to networks is another layer of latent clustering. Uncovering this latent structure allows service providers to further increase the accuracy of email filtering and to effectively defend against distributed denial-of-service attacks. To this end, a discriminative clustering model is derived in this thesis that is based on the graph of observed emails. The partitionings inferred using this model are evaluated through their capacity to predict the campaigns of new emails. Furthermore, when classifying the content of emails, statistical information about the sending server can be valuable. Learning a model that is able to make use of it requires training data that includes server statistics. In order to also use training data where the server statistics are missing, a model is developed that is a mixture over potentially all substitutions thereof. Another application is to predict the navigation behavior of the users of a website. Here, there is no a priori partitioning of the users into clusters, but to understand different usage scenarios and design different layouts for them, imposing a partitioning is necessary. The presented approach simultaneously optimizes the discriminative as well as the predictive power of the clusters. Each model is evaluated on real-world data and compared to baseline methods.
The results show that explicitly modeling the assumptions about the latent cluster structure leads to improved predictions compared to the baselines. It is beneficial to incorporate a small number of hyperparameters that can be tuned to yield the best predictions in cases where the prediction accuracy cannot be optimized directly.}, language = {en} }