@phdthesis{Shaabani2020, author = {Shaabani, Nuhad}, title = {On discovering and incrementally updating inclusion dependencies}, doi = {10.25932/publishup-47186}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-471862}, school = {Universit{\"a}t Potsdam}, pages = {119}, year = {2020}, abstract = {In today's world, many applications produce large amounts of data at an enormous rate. Analyzing such datasets for metadata is indispensable for effectively understanding, storing, querying, manipulating, and mining them. Metadata summarizes technical properties of a dataset which rang from basic statistics to complex structures describing data dependencies. One type of dependencies is inclusion dependency (IND), which expresses subset-relationships between attributes of datasets. Therefore, inclusion dependencies are important for many data management applications in terms of data integration, query optimization, schema redesign, or integrity checking. So, the discovery of inclusion dependencies in unknown or legacy datasets is at the core of any data profiling effort. For exhaustively detecting all INDs in large datasets, we developed S-indd++, a new algorithm that eliminates the shortcomings of existing IND-detection algorithms and significantly outperforms them. S-indd++ is based on a novel concept for the attribute clustering for efficiently deriving INDs. Inferring INDs from our attribute clustering eliminates all redundant operations caused by other algorithms. S-indd++ is also based on a novel partitioning strategy that enables discording a large number of candidates in early phases of the discovering process. Moreover, S-indd++ does not require to fit a partition into the main memory--this is a highly appreciable property in the face of ever-growing datasets. S-indd++ reduces up to 50\% of the runtime of the state-of-the-art approach. None of the approach for discovering INDs is appropriate for the application on dynamic datasets; they can not update the INDs after an update of the dataset without reprocessing it entirely. To this end, we developed the first approach for incrementally updating INDs in frequently changing datasets. We achieved that by reducing the problem of incrementally updating INDs to the incrementally updating the attribute clustering from which all INDs are efficiently derivable. We realized the update of the clusters by designing new operations to be applied to the clusters after every data update. The incremental update of INDs reduces the time of the complete rediscovery by up to 99.999\%. All existing algorithms for discovering n-ary INDs are based on the principle of candidate generation--they generate candidates and test their validity in the given data instance. The major disadvantage of this technique is the exponentially growing number of database accesses in terms of SQL queries required for validation. We devised Mind2, the first approach for discovering n-ary INDs without candidate generation. Mind2 is based on a new mathematical framework developed in this thesis for computing the maximum INDs from which all other n-ary INDs are derivable. The experiments showed that Mind2 is significantly more scalable and effective than hypergraph-based algorithms.}, language = {en} } @misc{ShaabaniMeinel2018, author = {Shaabani, Nuhad and Meinel, Christoph}, title = {Improving the efficiency of inclusion dependency detection}, series = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, journal = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, publisher = {Association for Computing Machinery}, address = {New York}, isbn = {978-1-4503-6014-2}, doi = {10.1145/3269206.3271724}, pages = {207 -- 216}, year = {2018}, abstract = {The detection of all inclusion dependencies (INDs) in an unknown dataset is at the core of any data profiling effort. Apart from the discovery of foreign key relationships, INDs can help perform data integration, integrity checking, schema (re-)design, and query optimization. With the advent of Big Data, the demand increases for efficient INDs discovery algorithms that can scale with the input data size. To this end, we propose S-INDD++ as a scalable system for detecting unary INDs in large datasets. S-INDD++ applies a new stepwise partitioning technique that helps discard a large number of attributes in early phases of the detection by processing the first partitions of smaller sizes. S-INDD++ also extends the concept of the attribute clustering to decide which attributes to be discarded based on the clustering result of each partition. Moreover, in contrast to the state-of-the-art, S-INDD++ does not require the partition to fit into the main memory-which is a highly appreciable property in the face of the ever growing datasets. We conducted an exhaustive evaluation of S-INDD++ by applying it to large datasets with thousands attributes and more than 266 million tuples. The results show the high superiority of S-INDD++ over the state-of-the-art. S-INDD++ reduced up to 50 \% of the runtime in comparison with BINDER, and up to 98 \% in comparison with S-INDD.}, language = {en} }