% Survey article in The VLDB Journal. The journal name is stored once, in
% `journal`; the page range uses the BibTeX `--` form without spaces.
@article{KossmannPapenbrockNaumann2021,
  author    = {Ko{\ss}mann, Jan and Papenbrock, Thorsten and Naumann, Felix},
  title     = {Data dependencies for query optimization},
  journal   = {The {VLDB} Journal},
  volume    = {31},
  number    = {1},
  pages     = {1--22},
  year      = {2021},
  publisher = {Springer},
  address   = {Berlin ; Heidelberg ; New York},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-021-00676-3},
  abstract  = {Effective query optimization is a core feature of any database management system. While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies that target the optimization of relational database queries. The survey supports database vendors to identify optimization opportunities as well as DBMS researchers to find related work and open research questions.},
  language  = {en}
}
% Journal article in IEEE TKDE. The journal name is stored once, in `journal`,
% with the acronym brace-protected; the page range uses `--` without spaces.
@article{CaruccioDeufemiaNaumannetal.2021,
  author    = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe},
  title     = {Discovering relaxed functional dependencies based on multi-attribute dominance},
  journal   = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume    = {33},
  number    = {9},
  pages     = {3212--3228},
  year      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers},
  address   = {New York, NY},
  issn      = {1041-4347},
  doi       = {10.1109/TKDE.2020.2967722},
  abstract  = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds. An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.},
  language  = {en}
}
% PhD thesis on single-column data profiling.
% NOTE(review): `x, 115` appears to be the thesis page count (front matter +
% body), not a page range, so it is stored in `pagetotal` -- confirm against
% the repository record.
@phdthesis{Harmouch2020,
  author    = {Harmouch, Hazar},
  title     = {Single-column data profiling},
  school    = {Universit{\"a}t Potsdam},
  year      = {2020},
  pagetotal = {x, 115},
  doi       = {10.25932/publishup-47455},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-474554},
  abstract  = {The research area of data profiling consists of a large set of methods and processes to examine a given dataset and determine metadata about it. Typically, different data profiling tasks address different kinds of metadata, comprising either various statistics about individual columns (Single-column Analysis) or relationships among them (Dependency Discovery). Among the basic statistics about a column are data type, header, the number of unique values (the column's cardinality), maximum and minimum values, the number of null values, and the value distribution. Dependencies involve, for instance, functional dependencies (FDs), inclusion dependencies (INDs), and their approximate versions. Data profiling has a wide range of conventional use cases, namely data exploration, cleansing, and integration. The produced metadata is also useful for database management and schema reverse engineering. Data profiling has also more novel use cases, such as big data analytics. The generated metadata describes the structure of the data at hand, how to import it, what it is about, and how much of it there is. Thus, data profiling can be considered as an important preparatory task for many data analysis and mining scenarios to assess which data might be useful and to reveal and understand a new dataset's characteristics. In this thesis, the main focus is on the single-column analysis class of data profiling tasks. We study the impact and the extraction of three of the most important metadata about a column, namely the cardinality, the header, and the number of null values. First, we present a detailed experimental study of twelve cardinality estimation algorithms. We classify the algorithms and analyze their efficiency, scaling far beyond the original experiments and testing theoretical guarantees. Our results highlight their trade-offs and point out the possibility to create a parallel or a distributed version of these algorithms to cope with the growing size of modern datasets. Then, we present a fully automated, multi-phase system to discover human-understandable, representative, and consistent headers for a target table in cases where headers are missing, meaningless, or unrepresentative for the column values. Our evaluation on Wikipedia tables shows that 60\% of the automatically discovered schemata are exact and complete. Considering more schema candidates, top-5 for example, increases this percentage to 72\%. Finally, we formally and experimentally show the ghost and fake FDs phenomenon caused by FD discovery over datasets with missing values. We propose two efficient scores, probabilistic and likelihood-based, for estimating the genuineness of a discovered FD. Our extensive set of experiments on real-world and semi-synthetic datasets show the effectiveness and efficiency of these scores.},
  language  = {en}
}