@article{GonschorekLangerBernhardtetal.2016, author = {Gonschorek, Julia and Langer, Anja and Bernhardt, Benjamin and Raebiger, Caroline}, title = {Big Data in the Field of Civil Security Research: Approaches for the Visual Preprocessing of Fire Brigade Operations}, series = {Science}, volume = {7}, journal = {Science}, publisher = {IGI Global}, address = {Hershey}, issn = {1947-3192}, doi = {10.4018/IJAEIS.2016010104}, pages = {54 -- 64}, year = {2016}, abstract = {This article gives insight in a running dissertation at the University in Potsdam. Point of discussion is the spatial and temporal distribution of emergencies of German fire brigades that have not sufficiently been scientifically examined. The challenge is seen in Big Data: enormous amounts of data that exist now (or can be collected in the future) and whose variables are linked to one another. These analyses and visualizations can form a basis for strategic, operational and tactical planning, as well as prevention measures. The user-centered (geo-) visualization of fire brigade data accessible to the general public is a scientific contribution to the research topic 'geovisual analytics and geographical profiling'. It may supplement antiquated methods such as the so-called pinmaps as well as the areas of engagement that are freehand constructions in GIS. Considering police work, there are already numerous scientific projects, publications, and software solutions designed to meet the specific requirements of Crime Analysis and Crime Mapping. By adapting and extending these methods and techniques, civil security research can be tailored to the needs of fire departments. In this paper, a selection of appropriate visualization methods will be presented and discussed.}, language = {en} } @article{VolandAsche2017, author = {Voland, Patrick and Asche, Hartmut}, title = {Processing and Visualizing Floating Car Data for Human-Centered Traffic and Environment Applications: A Transdisciplinary Approach}, series = {International journal of agricultural and environmental information systems : an official publication of the Information Resources Management Association}, volume = {8}, journal = {International journal of agricultural and environmental information systems : an official publication of the Information Resources Management Association}, publisher = {IGI Global}, address = {Hershey}, issn = {1947-3192}, doi = {10.4018/IJAEIS.2017040103}, pages = {32 -- 49}, year = {2017}, abstract = {In the era of the Internet of Things and Big Data modern cars have become mobile electronic systems or computers on wheels. Car sensors record a multitude of car and traffic related data as well as environmental parameters outside the vehicle. The data recorded are spatio-temporal by nature (floating car data) and can thus be classified as geodata. Their geospatial potential is, however, not fully exploited so far. In this paper, we present an approach to collect, process and visualize floating car data for traffic-and environment-related applications. It is demonstrated that cartographic visualization, in particular, is as effective means to make the enormous stocks of machine-recorded data available to human perception, exploration and analysis.}, language = {en} } @phdthesis{Richter2018, author = {Richter, Rico}, title = {Concepts and techniques for processing and rendering of massive 3D point clouds}, doi = {10.25932/publishup-42330}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-423304}, school = {Universit{\"a}t Potsdam}, pages = {v, 131}, year = {2018}, abstract = {Remote sensing technology, such as airborne, mobile, or terrestrial laser scanning, and photogrammetric techniques, are fundamental approaches for efficient, automatic creation of digital representations of spatial environments. For example, they allow us to generate 3D point clouds of landscapes, cities, infrastructure networks, and sites. As essential and universal category of geodata, 3D point clouds are used and processed by a growing number of applications, services, and systems such as in the domains of urban planning, landscape architecture, environmental monitoring, disaster management, virtual geographic environments as well as for spatial analysis and simulation. While the acquisition processes for 3D point clouds become more and more reliable and widely-used, applications and systems are faced with more and more 3D point cloud data. In addition, 3D point clouds, by their very nature, are raw data, i.e., they do not contain any structural or semantics information. Many processing strategies common to GIS such as deriving polygon-based 3D models generally do not scale for billions of points. GIS typically reduce data density and precision of 3D point clouds to cope with the sheer amount of data, but that results in a significant loss of valuable information at the same time. This thesis proposes concepts and techniques designed to efficiently store and process massive 3D point clouds. To this end, object-class segmentation approaches are presented to attribute semantics to 3D point clouds, used, for example, to identify building, vegetation, and ground structures and, thus, to enable processing, analyzing, and visualizing 3D point clouds in a more effective and efficient way. Similarly, change detection and updating strategies for 3D point clouds are introduced that allow for reducing storage requirements and incrementally updating 3D point cloud databases. In addition, this thesis presents out-of-core, real-time rendering techniques used to interactively explore 3D point clouds and related analysis results. All techniques have been implemented based on specialized spatial data structures, out-of-core algorithms, and GPU-based processing schemas to cope with massive 3D point clouds having billions of points. All proposed techniques have been evaluated and demonstrated their applicability to the field of geospatial applications and systems, in particular for tasks such as classification, processing, and visualization. Case studies for 3D point clouds of entire cities with up to 80 billion points show that the presented approaches open up new ways to manage and apply large-scale, dense, and time-variant 3D point clouds as required by a rapidly growing number of applications and systems.}, language = {en} } @phdthesis{Jaeger2018, author = {Jaeger, David}, title = {Enabling Big Data security analytics for advanced network attack detection}, doi = {10.25932/publishup-43571}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-435713}, school = {Universit{\"a}t Potsdam}, pages = {XVII, 201, XXXIII}, year = {2018}, abstract = {The last years have shown an increasing sophistication of attacks against enterprises. Traditional security solutions like firewalls, anti-virus systems and generally Intrusion Detection Systems (IDSs) are no longer sufficient to protect an enterprise against these advanced attacks. One popular approach to tackle this issue is to collect and analyze events generated across the IT landscape of an enterprise. This task is achieved by the utilization of Security Information and Event Management (SIEM) systems. However, the majority of the currently existing SIEM solutions is not capable of handling the massive volume of data and the diversity of event representations. Even if these solutions can collect the data at a central place, they are neither able to extract all relevant information from the events nor correlate events across various sources. Hence, only rather simple attacks are detected, whereas complex attacks, consisting of multiple stages, remain undetected. Undoubtedly, security operators of large enterprises are faced with a typical Big Data problem. In this thesis, we propose and implement a prototypical SIEM system named Real-Time Event Analysis and Monitoring System (REAMS) that addresses the Big Data challenges of event data with common paradigms, such as data normalization, multi-threading, in-memory storage, and distributed processing. In particular, a mostly stream-based event processing workflow is proposed that collects, normalizes, persists and analyzes events in near real-time. In this regard, we have made various contributions in the SIEM context. First, we propose a high-performance normalization algorithm that is highly parallelized across threads and distributed across nodes. Second, we are persisting into an in-memory database for fast querying and correlation in the context of attack detection. Third, we propose various analysis layers, such as anomaly- and signature-based detection, that run on top of the normalized and correlated events. As a result, we demonstrate our capabilities to detect previously known as well as unknown attack patterns. Lastly, we have investigated the integration of cyber threat intelligence (CTI) into the analytical process, for instance, for correlating monitored user accounts with previously collected public identity leaks to identify possible compromised user accounts. In summary, we show that a SIEM system can indeed monitor a large enterprise environment with a massive load of incoming events. As a result, complex attacks spanning across the whole network can be uncovered and mitigated, which is an advancement in comparison to existing SIEM systems on the market.}, language = {en} } @phdthesis{Shaabani2020, author = {Shaabani, Nuhad}, title = {On discovering and incrementally updating inclusion dependencies}, doi = {10.25932/publishup-47186}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-471862}, school = {Universit{\"a}t Potsdam}, pages = {119}, year = {2020}, abstract = {In today's world, many applications produce large amounts of data at an enormous rate. Analyzing such datasets for metadata is indispensable for effectively understanding, storing, querying, manipulating, and mining them. Metadata summarizes technical properties of a dataset which rang from basic statistics to complex structures describing data dependencies. One type of dependencies is inclusion dependency (IND), which expresses subset-relationships between attributes of datasets. Therefore, inclusion dependencies are important for many data management applications in terms of data integration, query optimization, schema redesign, or integrity checking. So, the discovery of inclusion dependencies in unknown or legacy datasets is at the core of any data profiling effort. For exhaustively detecting all INDs in large datasets, we developed S-indd++, a new algorithm that eliminates the shortcomings of existing IND-detection algorithms and significantly outperforms them. S-indd++ is based on a novel concept for the attribute clustering for efficiently deriving INDs. Inferring INDs from our attribute clustering eliminates all redundant operations caused by other algorithms. S-indd++ is also based on a novel partitioning strategy that enables discording a large number of candidates in early phases of the discovering process. Moreover, S-indd++ does not require to fit a partition into the main memory--this is a highly appreciable property in the face of ever-growing datasets. S-indd++ reduces up to 50\% of the runtime of the state-of-the-art approach. None of the approach for discovering INDs is appropriate for the application on dynamic datasets; they can not update the INDs after an update of the dataset without reprocessing it entirely. To this end, we developed the first approach for incrementally updating INDs in frequently changing datasets. We achieved that by reducing the problem of incrementally updating INDs to the incrementally updating the attribute clustering from which all INDs are efficiently derivable. We realized the update of the clusters by designing new operations to be applied to the clusters after every data update. The incremental update of INDs reduces the time of the complete rediscovery by up to 99.999\%. All existing algorithms for discovering n-ary INDs are based on the principle of candidate generation--they generate candidates and test their validity in the given data instance. The major disadvantage of this technique is the exponentially growing number of database accesses in terms of SQL queries required for validation. We devised Mind2, the first approach for discovering n-ary INDs without candidate generation. Mind2 is based on a new mathematical framework developed in this thesis for computing the maximum INDs from which all other n-ary INDs are derivable. The experiments showed that Mind2 is significantly more scalable and effective than hypergraph-based algorithms.}, language = {en} } @article{CaruccioDeufemiaNaumannetal.2021, author = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe}, title = {Discovering relaxed functional dependencies based on multi-attribute dominance}, series = {IEEE transactions on knowledge and data engineering}, volume = {33}, journal = {IEEE transactions on knowledge and data engineering}, number = {9}, publisher = {Institute of Electrical and Electronics Engineers}, address = {New York, NY}, issn = {1041-4347}, doi = {10.1109/TKDE.2020.2967722}, pages = {3212 -- 3228}, year = {2021}, abstract = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds. An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.}, language = {en} } @article{RuedianVladova2021, author = {R{\"u}dian, Sylvio Leo and Vladova, Gergana}, title = {Kostenfreie Onlinekurse nachhaltig mit personalisiertem Marketing finanzieren}, series = {HMD Praxis der Wirtschaftsinformatik}, volume = {58}, journal = {HMD Praxis der Wirtschaftsinformatik}, number = {3}, publisher = {Springer Vieweg}, address = {Wiesbaden}, issn = {1436-3011}, doi = {10.1365/s40702-021-00720-4}, pages = {507 -- 520}, year = {2021}, abstract = {Selbstbestimmtes Lernen mit Onlinekursen findet zunehmend mehr Akzeptanz in unserer Gesellschaft. Lernende k{\"o}nnen mithilfe von Onlinekursen selbst festlegen, was sie wann lernen und Kurse k{\"o}nnen durch vielf{\"a}ltige Adaptionen an den Lernfortschritt der Nutzer angepasst und individualisiert werden. Auf der einen Seite ist eine große Zielgruppe f{\"u}r diese Lernangebote vorhanden. Auf der anderen Seite sind die Erstellung von Onlinekursen, ihre Bereitstellung, Wartung und Betreuung kostenintensiv, wodurch hochwertige Angebote h{\"a}ufig kostenpflichtig angeboten werden m{\"u}ssen, um als Anbieter zumindest kostenneutral agieren zu k{\"o}nnen. In diesem Beitrag er{\"o}rtern und diskutieren wir ein offenes, nachhaltiges datengetriebenes zweiseitiges Gesch{\"a}ftsmodell zur Verwertung gepr{\"u}fter Onlinekurse und deren kostenfreie Bereitstellung f{\"u}r jeden Lernenden. Kern des Gesch{\"a}ftsmodells ist die Nutzung der dabei entstehenden Verhaltensdaten, die daraus m{\"o}gliche Ableitung von Pers{\"o}nlichkeitsmerkmalen und Interessen und deren Nutzung im kommerziellen Kontext. Dies ist eine bei der Websuche bereits weitl{\"a}ufig akzeptierte Methode, welche nun auf den Lernkontext {\"u}bertragen wird. Welche M{\"o}glichkeiten, Herausforderungen, aber auch Barrieren {\"u}berwunden werden m{\"u}ssen, damit das Gesch{\"a}ftsmodell nachhaltig und ethisch vertretbar funktioniert, werden zwei unabh{\"a}ngige, jedoch synergetisch verbundene Gesch{\"a}ftsmodelle vorgestellt und diskutiert. Zus{\"a}tzlich wurde die Akzeptanz und Erwartung der Zielgruppe f{\"u}r das vorgestellte Gesch{\"a}ftsmodell untersucht, um notwendige Kernressourcen f{\"u}r die Praxis abzuleiten. Die Ergebnisse der Untersuchung zeigen, dass das Gesch{\"a}ftsmodell von den Nutzer*innen grundlegend akzeptiert wird. 10 \% der Befragten w{\"u}rden es bevorzugen, mit virtuellen Assistenten - anstelle mit Tutor*innen zu lernen. Zudem ist der Großteil der Nutzer*innen sich nicht dar{\"u}ber bewusst, dass Pers{\"o}nlichkeitsmerkmale anhand des Nutzerverhaltens abgeleitet werden k{\"o}nnen.}, language = {de} }