@phdthesis{Neumann2013, author = {Neumann, Stefan}, title = {Modular timing analysis of component-based real-time embedded systems}, address = {Potsdam}, pages = {218 S.}, year = {2013}, language = {en} } @phdthesis{Yang2013, author = {Yang, Haojin}, title = {Automatic video indexing and retrieval using video ocr technology}, address = {Potsdam}, pages = {182 S.}, year = {2013}, language = {en} } @phdthesis{Alsadeh2013, author = {Alsadeh, Ahmad}, title = {Augmented secure neighbor discovery: aligning security, privacy and usability}, address = {Potsdam}, pages = {114 S.}, year = {2013}, language = {en} } @phdthesis{Phusanga2013, author = {Phusanga, Dara}, title = {Derived algebraic systems}, address = {Potsdam}, pages = {81 S.}, year = {2013}, language = {en} } @phdthesis{Schaffner2013, author = {Schaffner, Jan}, title = {Multi tenancy for cloud-based in-memory column databases : workload management and data placement}, address = {Potsdam}, pages = {135 S.}, year = {2013}, language = {en} } @phdthesis{Albrecht2013, author = {Albrecht, Alexander}, title = {Understanding and managing extract-transform-load systems}, pages = {107}, year = {2013}, language = {en} } @phdthesis{Haider2013, author = {Haider, Peter}, title = {Prediction with Mixture Models}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-69617}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Learning a model for the relationship between the attributes and the annotated labels of data examples serves two purposes. Firstly, it enables the prediction of the label for examples without annotation. Secondly, the parameters of the model can provide useful insights into the structure of the data. If the data has an inherent partitioned structure, it is natural to mirror this structure in the model. Such mixture models predict by combining the individual predictions generated by the mixture components which correspond to the partitions in the data. Often the partitioned structure is latent, and has to be inferred when learning the mixture model. Directly evaluating the accuracy of the inferred partition structure is, in many cases, impossible because the ground truth cannot be obtained for comparison. However it can be assessed indirectly by measuring the prediction accuracy of the mixture model that arises from it. This thesis addresses the interplay between the improvement of predictive accuracy by uncovering latent cluster structure in data, and further addresses the validation of the estimated structure by measuring the accuracy of the resulting predictive model. In the application of filtering unsolicited emails, the emails in the training set are latently clustered into advertisement campaigns. Uncovering this latent structure allows filtering of future emails with very low false positive rates. In order to model the cluster structure, a Bayesian clustering model for dependent binary features is developed in this thesis. Knowing the clustering of emails into campaigns can also aid in uncovering which emails have been sent on behalf of the same network of captured hosts, so-called botnets. This association of emails to networks is another layer of latent clustering. Uncovering this latent structure allows service providers to further increase the accuracy of email filtering and to effectively defend against distributed denial-of-service attacks. To this end, a discriminative clustering model is derived in this thesis that is based on the graph of observed emails. The partitionings inferred using this model are evaluated through their capacity to predict the campaigns of new emails. Furthermore, when classifying the content of emails, statistical information about the sending server can be valuable. Learning a model that is able to make use of it requires training data that includes server statistics. In order to also use training data where the server statistics are missing, a model that is a mixture over potentially all substitutions thereof is developed. Another application is to predict the navigation behavior of the users of a website. Here, there is no a priori partitioning of the users into clusters, but to understand different usage scenarios and design different layouts for them, imposing a partitioning is necessary. The presented approach simultaneously optimizes the discriminative as well as the predictive power of the clusters. Each model is evaluated on real-world data and compared to baseline methods. The results show that explicitly modeling the assumptions about the latent cluster structure leads to improved predictions compared to the baselines. It is beneficial to incorporate a small number of hyperparameters that can be tuned to yield the best predictions in cases where the prediction accuracy can not be optimized directly.}, language = {en} } @phdthesis{Boehm2013, author = {B{\"o}hm, Christoph}, title = {Enriching the Web of Data with topics and links}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-68624}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {This thesis presents novel ideas and research findings for the Web of Data - a global data space spanning many so-called Linked Open Data sources. Linked Open Data adheres to a set of simple principles to allow easy access and reuse for data published on the Web. Linked Open Data is by now an established concept and many (mostly academic) publishers adopted the principles building a powerful web of structured knowledge available to everybody. However, so far, Linked Open Data does not yet play a significant role among common web technologies that currently facilitate a high-standard Web experience. In this work, we thoroughly discuss the state-of-the-art for Linked Open Data and highlight several shortcomings - some of them we tackle in the main part of this work. First, we propose a novel type of data source meta-information, namely the topics of a dataset. This information could be published with dataset descriptions and support a variety of use cases, such as data source exploration and selection. For the topic retrieval, we present an approach coined Annotated Pattern Percolation (APP), which we evaluate with respect to topics extracted from Wikipedia portals. Second, we contribute to entity linking research by presenting an optimization model for joint entity linking, showing its hardness, and proposing three heuristics implemented in the LINked Data Alignment (LINDA) system. Our first solution can exploit multi-core machines, whereas the second and third approach are designed to run in a distributed shared-nothing environment. We discuss and evaluate the properties of our approaches leading to recommendations which algorithm to use in a specific scenario. The distributed algorithms are among the first of their kind, i.e., approaches for joint entity linking in a distributed fashion. Also, we illustrate that we can tackle the entity linking problem on the very large scale with data comprising more than 100 millions of entity representations from very many sources. Finally, we approach a sub-problem of entity linking, namely the alignment of concepts. We again target a method that looks at the data in its entirety and does not neglect existing relations. Also, this concept alignment method shall execute very fast to serve as a preprocessing for further computations. Our approach, called Holistic Concept Matching (HCM), achieves the required speed through grouping the input by comparing so-called knowledge representations. Within the groups, we perform complex similarity computations, relation conclusions, and detect semantic contradictions. The quality of our result is again evaluated on a large and heterogeneous dataset from the real Web. In summary, this work contributes a set of techniques for enhancing the current state of the Web of Data. All approaches have been tested on large and heterogeneous real-world input.}, language = {en} } @phdthesis{Dawoud2013, author = {Dawoud, Wesam}, title = {Scalability and performance management of internet applications in the cloud}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-68187}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {Cloud computing is a model for enabling on-demand access to a shared pool of computing resources. With virtually limitless on-demand resources, a cloud environment enables the hosted Internet application to quickly cope when there is an increase in the workload. However, the overhead of provisioning resources exposes the Internet application to periods of under-provisioning and performance degradation. Moreover, the performance interference, due to the consolidation in the cloud environment, complicates the performance management of the Internet applications. In this dissertation, we propose two approaches to mitigate the impact of the resources provisioning overhead. The first approach employs control theory to scale resources vertically and cope fast with workload. This approach assumes that the provider has knowledge and control over the platform running in the virtual machines (VMs), which limits it to Platform as a Service (PaaS) and Software as a Service (SaaS) providers. The second approach is a customer-side one that deals with the horizontal scalability in an Infrastructure as a Service (IaaS) model. It addresses the trade-off problem between cost and performance with a multi-goal optimization solution. This approach finds the scale thresholds that achieve the highest performance with the lowest increase in the cost. Moreover, the second approach employs a proposed time series forecasting algorithm to scale the application proactively and avoid under-utilization periods. Furthermore, to mitigate the interference impact on the Internet application performance, we developed a system which finds and eliminates the VMs suffering from performance interference. The developed system is a light-weight solution which does not imply provider involvement. To evaluate our approaches and the designed algorithms at large-scale level, we developed a simulator called (ScaleSim). In the simulator, we implemented scalability components acting as the scalability components of Amazon EC2. The current scalability implementation in Amazon EC2 is used as a reference point for evaluating the improvement in the scalable application performance. ScaleSim is fed with realistic models of the RUBiS benchmark extracted from the real environment. The workload is generated from the access logs of the 1998 world cup website. The results show that optimizing the scalability thresholds and adopting proactive scalability can mitigate 88\% of the resources provisioning overhead impact with only a 9\% increase in the cost.}, language = {en} } @phdthesis{Scheffler2013, author = {Scheffler, Thomas}, title = {Privacy enforcement with data owner-defined policies}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-67939}, school = {Universit{\"a}t Potsdam}, year = {2013}, abstract = {This thesis proposes a privacy protection framework for the controlled distribution and use of personal private data. The framework is based on the idea that privacy policies can be set directly by the data owner and can be automatically enforced against the data user. Data privacy continues to be a very important topic, as our dependency on electronic communication maintains its current growth, and private data is shared between multiple devices, users and locations. The growing amount and the ubiquitous availability of personal private data increases the likelihood of data misuse. Early privacy protection techniques, such as anonymous email and payment systems have focused on data avoidance and anonymous use of services. They did not take into account that data sharing cannot be avoided when people participate in electronic communication scenarios that involve social interactions. This leads to a situation where data is shared widely and uncontrollably and in most cases the data owner has no control over further distribution and use of personal private data. Previous efforts to integrate privacy awareness into data processing workflows have focused on the extension of existing access control frameworks with privacy aware functions or have analysed specific individual problems such as the expressiveness of policy languages. So far, very few implementations of integrated privacy protection mechanisms exist and can be studied to prove their effectiveness for privacy protection. Second level issues that stem from practical application of the implemented mechanisms, such as usability, life-time data management and changes in trustworthiness have received very little attention so far, mainly because they require actual implementations to be studied. Most existing privacy protection schemes silently assume that it is the privilege of the data user to define the contract under which personal private data is released. Such an approach simplifies policy management and policy enforcement for the data user, but leaves the data owner with a binary decision to submit or withhold his or her personal data based on the provided policy. We wanted to empower the data owner to express his or her privacy preferences through privacy policies that follow the so-called Owner-Retained Access Control (ORAC) model. ORAC has been proposed by McCollum, et al. as an alternate access control mechanism that leaves the authority over access decisions by the originator of the data. The data owner is given control over the release policy for his or her personal data, and he or she can set permissions or restrictions according to individually perceived trust values. Such a policy needs to be expressed in a coherent way and must allow the deterministic policy evaluation by different entities. The privacy policy also needs to be communicated from the data owner to the data user, so that it can be enforced. Data and policy are stored together as a Protected Data Object that follows the Sticky Policy paradigm as defined by Mont, et al. and others. We developed a unique policy combination approach that takes usability aspects for the creation and maintenance of policies into consideration. Our privacy policy consists of three parts: A Default Policy provides basic privacy protection if no specific rules have been entered by the data owner. An Owner Policy part allows the customisation of the default policy by the data owner. And a so-called Safety Policy guarantees that the data owner cannot specify disadvantageous policies, which, for example, exclude him or her from further access to the private data. The combined evaluation of these three policy-parts yields the necessary access decision. The automatic enforcement of privacy policies in our protection framework is supported by a reference monitor implementation. We started our work with the development of a client-side protection mechanism that allows the enforcement of data-use restrictions after private data has been released to the data user. The client-side enforcement component for data-use policies is based on a modified Java Security Framework. Privacy policies are translated into corresponding Java permissions that can be automatically enforced by the Java Security Manager. When we later extended our work to implement server-side protection mechanisms, we found several drawbacks for the privacy enforcement through the Java Security Framework. We solved this problem by extending our reference monitor design to use Aspect-Oriented Programming (AOP) and the Java Reflection API to intercept data accesses in existing applications and provide a way to enforce data owner-defined privacy policies for business applications.}, language = {en} }