@book{DraisbachNaumannSzottetal.2012, author = {Draisbach, Uwe and Naumann, Felix and Szott, Sascha and Wonneberg, Oliver}, title = {Adaptive windows for duplicate detection}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-143-1}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53007}, publisher = {Universit{\"a}t Potsdam}, pages = {41}, year = {2012}, abstract = {Duplicate detection is the task of identifying all groups of records within a data set that represent the same real-world entity, respectively. This task is difficult, because (i) representations might differ slightly, so some similarity measure must be defined to compare pairs of records and (ii) data sets might have a high volume making a pair-wise comparison of all records infeasible. To tackle the second problem, many algorithms have been suggested that partition the data set and compare all record pairs only within each partition. One well-known such approach is the Sorted Neighborhood Method (SNM), which sorts the data according to some key and then advances a window over the data comparing only records that appear within the same window. We propose several variations of SNM that have in common a varying window size and advancement. The general intuition of such adaptive windows is that there might be regions of high similarity suggesting a larger window size and regions of lower similarity suggesting a smaller window size. We propose and thoroughly evaluate several adaption strategies, some of which are provably better than the original SNM in terms of efficiency (same results with fewer comparisons).}, language = {en} } @book{AlbrechtNaumann2012, author = {Albrecht, Alexander and Naumann, Felix}, title = {Understanding cryptic schemata in large extract-transform-load systems}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-201-8}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-61257}, publisher = {Universit{\"a}t Potsdam}, pages = {19}, year = {2012}, abstract = {Extract-Transform-Load (ETL) tools are used for the creation, maintenance, and evolution of data warehouses, data marts, and operational data stores. ETL workflows populate those systems with data from various data sources by specifying and executing a DAG of transformations. Over time, hundreds of individual workflows evolve as new sources and new requirements are integrated into the system. The maintenance and evolution of large-scale ETL systems requires much time and manual effort. A key problem is to understand the meaning of unfamiliar attribute labels in source and target databases and ETL transformations. Hard-to-understand attribute labels lead to frustration and time spent to develop and understand ETL workflows. We present a schema decryption technique to support ETL developers in understanding cryptic schemata of sources, targets, and ETL transformations. For a given ETL system, our recommender-like approach leverages the large number of mapped attribute labels in existing ETL workflows to produce good and meaningful decryptions. In this way we are able to decrypt attribute labels consisting of a number of unfamiliar few-letter abbreviations, such as UNP_PEN_INT, which we can decrypt to UNPAID_PENALTY_INTEREST. We evaluate our schema decryption approach on three real-world repositories of ETL workflows and show that our approach is able to suggest high-quality decryptions for cryptic attribute labels in a given schema.}, language = {en} } @book{BauckmannAbedjanLeseretal.2012, author = {Bauckmann, Jana and Abedjan, Ziawasch and Leser, Ulf and M{\"u}ller, Heiko and Naumann, Felix}, title = {Covering or complete? : Discovering conditional inclusion dependencies}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-212-4}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-62089}, publisher = {Universit{\"a}t Potsdam}, pages = {34}, year = {2012}, abstract = {Data dependencies, or integrity constraints, are used to improve the quality of a database schema, to optimize queries, and to ensure consistency in a database. In the last years conditional dependencies have been introduced to analyze and improve data quality. In short, a conditional dependency is a dependency with a limited scope defined by conditions over one or more attributes. Only the matching part of the instance must adhere to the dependency. In this paper we focus on conditional inclusion dependencies (CINDs). We generalize the definition of CINDs, distinguishing covering and completeness conditions. We present a new use case for such CINDs showing their value for solving complex data quality tasks. Further, we define quality measures for conditions inspired by precision and recall. We propose efficient algorithms that identify covering and completeness conditions conforming to given quality thresholds. Our algorithms choose not only the condition values but also the condition attributes automatically. Finally, we show that our approach efficiently provides meaningful and helpful results for our use case.}, language = {en} } @book{HerschelNaumann2008, author = {Herschel, Melanie and Naumann, Felix}, title = {Space and time scalability of duplicate detection in graph data}, isbn = {978-3-940793-46-1}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-32851}, publisher = {Universit{\"a}t Potsdam}, year = {2008}, abstract = {Duplicate detection consists in determining different representations of real-world objects in a database. Recent research has considered the use of relationships among object representations to improve duplicate detection. In the general case where relationships form a graph, research has mainly focused on duplicate detection quality/effectiveness. Scalability has been neglected so far, even though it is crucial for large real-world duplicate detection tasks. In this paper we scale up duplicate detection in graph data (DDG) to large amounts of data and pairwise comparisons, using the support of a relational database system. To this end, we first generalize the process of DDG. We then present how to scale algorithms for DDG in space (amount of data processed with limited main memory) and in time. Finally, we explore how complex similarity computation can be performed efficiently. Experiments on data an order of magnitude larger than data considered so far in DDG clearly show that our methods scale to large amounts of data not residing in main memory.}, language = {en} } @book{LangeBoehmNaumann2010, author = {Lange, Dustin and B{\"o}hm, Christoph and Naumann, Felix}, title = {Extracting structured information from Wikipedia articles to populate infoboxes}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-081-6}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-45714}, publisher = {Universit{\"a}t Potsdam}, pages = {27}, year = {2010}, abstract = {Roughly every third Wikipedia article contains an infobox - a table that displays important facts about the subject in attribute-value form. The schema of an infobox, i.e., the attributes that can be expressed for a concept, is defined by an infobox template. Often, authors do not specify all template attributes, resulting in incomplete infoboxes. With iPopulator, we introduce a system that automatically populates infoboxes of Wikipedia articles by extracting attribute values from the article's text. In contrast to prior work, iPopulator detects and exploits the structure of attribute values for independently extracting value parts. We have tested iPopulator on the entire set of infobox templates and provide a detailed analysis of its effectiveness. For instance, we achieve an average extraction precision of 91\% for 1,727 distinct infobox template attributes.}, language = {en} } @book{BauckmannLeserNaumann2010, author = {Bauckmann, Jana and Leser, Ulf and Naumann, Felix}, title = {Efficient and exact computation of inclusion dependencies for data integration}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-048-9}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-41396}, publisher = {Universit{\"a}t Potsdam}, pages = {36}, year = {2010}, abstract = {Data obtained from foreign data sources often come with only superficial structural information, such as relation names and attribute names. Other types of metadata that are important for effective integration and meaningful querying of such data sets are missing. In particular, relationships among attributes, such as foreign keys, are crucial metadata for understanding the structure of an unknown database. The discovery of such relationships is difficult, because in principle for each pair of attributes in the database each pair of data values must be compared. A precondition for a foreign key is an inclusion dependency (IND) between the key and the foreign key attributes. We present with Spider an algorithm that efficiently finds all INDs in a given relational database. It leverages the sorting facilities of DBMS but performs the actual comparisons outside of the database to save computation. Spider analyzes very large databases up to an order of magnitude faster than previous approaches. We also evaluate in detail the effectiveness of several heuristics to reduce the number of necessary comparisons. Furthermore, we generalize Spider to find composite INDs covering multiple attributes, and partial INDs, which are true INDs for all but a certain number of values. This last type is particularly relevant when integrating dirty data as is often the case in the life sciences domain - our driving motivation.}, language = {en} } @book{AbedjanNaumann2011, author = {Abedjan, Ziawasch and Naumann, Felix}, title = {Advancing the discovery of unique column combinations}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-148-6}, issn = {1613-5652}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564}, publisher = {Universit{\"a}t Potsdam}, pages = {25}, year = {2011}, abstract = {Unique column combinations of a relational database table are sets of columns that contain only unique values. Discovering such combinations is a fundamental research problem and has many different data management and knowledge discovery applications. Existing discovery algorithms are either brute force or have a high memory load and can thus be applied only to small datasets or samples. In this paper, the wellknown GORDIAN algorithm and "Apriori-based" algorithms are compared and analyzed for further optimization. We greatly improve the Apriori algorithms through efficient candidate generation and statistics-based pruning methods. A hybrid solution HCAGORDIAN combines the advantages of GORDIAN and our new algorithm HCA, and it significantly outperforms all previous work in many situations.}, language = {en} } @book{MeinelPlattnerDoellneretal.2014, author = {Meinel, Christoph and Plattner, Hasso and D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and Giese, Holger and Baudisch, Patrick}, title = {Proceedings of the 7th Ph.D. Retreat of the HPI Research School on Service-oriented Systems Engineering}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-273-5}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-63490}, publisher = {Universit{\"a}t Potsdam}, pages = {ii, 218}, year = {2014}, abstract = {Design and Implementation of service-oriented architectures imposes a huge number of research questions from the fields of software engineering, system analysis and modeling, adaptability, and application integration. Component orientation and web services are two approaches for design and realization of complex web-based system. Both approaches allow for dynamic application adaptation as well as integration of enterprise application. Commonly used technologies, such as J2EE and .NET, form de facto standards for the realization of complex distributed systems. Evolution of component systems has lead to web services and service-based architectures. This has been manifested in a multitude of industry standards and initiatives such as XML, WSDL UDDI, SOAP, etc. All these achievements lead to a new and promising paradigm in IT systems engineering which proposes to design complex software solutions as collaboration of contractually defined software services. Service-Oriented Systems Engineering represents a symbiosis of best practices in object-orientation, component-based development, distributed computing, and business process management. It provides integration of business and IT concerns. The annual Ph.D. Retreat of the Research School provides each member the opportunity to present his/her current state of their research and to give an outline of a prospective Ph.D. thesis. Due to the interdisciplinary structure of the Research Scholl, this technical report covers a wide range of research topics. These include but are not limited to: Self-Adaptive Service-Oriented Systems, Operating System Support for Service-Oriented Systems, Architecture and Modeling of Service-Oriented Systems, Adaptive Process Management, Services Composition and Workflow Planning, Security Engineering of Service-Based IT Systems, Quantitative Analysis and Optimization of Service-Oriented Systems, Service-Oriented Systems in 3D Computer Graphics sowie Service-Oriented Geoinformatics.}, language = {en} } @book{MeinelDoellnerWeskeetal.2021, author = {Meinel, Christoph and D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and Giese, Holger and Baudisch, Patrick and Friedrich, Tobias and B{\"o}ttinger, Erwin and Lippert, Christoph and D{\"o}rr, Christian and Lehmann, Anja and Renard, Bernhard and Rabl, Tilmann and Uebernickel, Falk and Arnrich, Bert and H{\"o}lzle, Katharina}, title = {Proceedings of the HPI Research School on Service-oriented Systems Engineering 2020 Fall Retreat}, number = {138}, publisher = {Universit{\"a}tsverlag Potsdam}, address = {Potsdam}, isbn = {978-3-86956-513-2}, issn = {1613-5652}, doi = {10.25932/publishup-50413}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-504132}, publisher = {Universit{\"a}t Potsdam}, pages = {vi, 144}, year = {2021}, abstract = {Design and Implementation of service-oriented architectures imposes a huge number of research questions from the fields of software engineering, system analysis and modeling, adaptability, and application integration. Component orientation and web services are two approaches for design and realization of complex web-based system. Both approaches allow for dynamic application adaptation as well as integration of enterprise application. Service-Oriented Systems Engineering represents a symbiosis of best practices in object-orientation, component-based development, distributed computing, and business process management. It provides integration of business and IT concerns. The annual Ph.D. Retreat of the Research School provides each member the opportunity to present his/her current state of their research and to give an outline of a prospective Ph.D. thesis. Due to the interdisciplinary structure of the research school, this technical report covers a wide range of topics. These include but are not limited to: Human Computer Interaction and Computer Vision as Service; Service-oriented Geovisualization Systems; Algorithm Engineering for Service-oriented Systems; Modeling and Verification of Self-adaptive Service-oriented Systems; Tools and Methods for Software Engineering in Service-oriented Systems; Security Engineering of Service-based IT Systems; Service-oriented Information Systems; Evolutionary Transition of Enterprise Applications to Service Orientation; Operating System Abstractions for Service-oriented Computing; and Services Specification, Composition, and Enactment.}, language = {en} }