@article{KossmannPapenbrockNaumann2021, author = {Koßmann, Jan and Papenbrock, Thorsten and Naumann, Felix}, title = {Data dependencies for query optimization}, series = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, volume = {31}, journal = {The VLDB journal : the international journal on very large data bases / publ. on behalf of the VLDB Endowment}, number = {1}, publisher = {Springer}, address = {Berlin ; Heidelberg ; New York}, issn = {1066-8888}, doi = {10.1007/s00778-021-00676-3}, pages = {1 -- 22}, year = {2021}, abstract = {Effective query optimization is a core feature of any database management system. While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies that target the optimization of relational database queries. The survey supports database vendors to identify optimization opportunities as well as DBMS researchers to find related work and open research questions.}, language = {en} } @article{KruseKaoudiContrerasRojasetal.2020, author = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and Chawla, Sanjay and Naumann, Felix and Quiane-Ruiz, Jorge-Arnulfo}, title = {RHEEMix in the data jungle}, series = {The VLDB Journal}, volume = {29}, journal = {The VLDB Journal}, number = {6}, publisher = {Springer}, address = {Berlin}, issn = {1066-8888}, doi = {10.1007/s00778-020-00612-x}, pages = {1287 -- 1310}, year = {2020}, abstract = {Data analytics are moving beyond the limits of a single platform. In this paper, we present the cost-based optimizer of Rheem, an open-source cross-platform system that copes with these new requirements. The optimizer allocates the subtasks of data analytic tasks to the most suitable platforms. Our main contributions are: (i) a mechanism based on graph transformations to explore alternative execution strategies; (ii) a novel graph-based approach to determine efficient data movement plans among subtasks and platforms; and (iii) an efficient plan enumeration algorithm, based on a novel enumeration algebra. We extensively evaluate our optimizer under diverse real tasks. We show that our optimizer can perform tasks more than one order of magnitude faster when using multiple platforms than when using a single platform.}, language = {en} }