% Conference paper in the ICDE 2019 proceedings (IEEE).
% Typed as inproceedings so that the venue is rendered via booktitle;
% the former series/journal fields only duplicated the proceedings title.
@inproceedings{KruseKaoudiQuianeRuizetal.2019,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Quian{\'e}-Ruiz, Jorge-Arnulfo and Chawla, Sanjay and Naumann, Felix and Contreras-Rojas, Bertty},
  title     = {Optimizing Cross-Platform Data Movement},
  booktitle = {2019 {IEEE} 35th International Conference on Data Engineering ({ICDE})},
  publisher = {IEEE},
  address   = {New York},
  isbn      = {978-1-5386-7474-1},
  issn      = {1084-4627},
  doi       = {10.1109/ICDE.2019.00162},
  pages     = {1642--1645},
  year      = {2019},
  abstract  = {Data analytics are moving beyond the limits of a single data processing platform. A cross-platform query optimizer is necessary to enable applications to run their tasks over multiple platforms efficiently and in a platform-agnostic manner. For the optimizer to be effective, it must consider data movement costs across different data processing platforms. In this paper, we present the graph-based data movement strategy used by RHEEM, our open-source cross-platform system. In particular, we (i) model the data movement problem as a new graph problem, which we prove to be NP-hard, and (ii) propose a novel graph exploration algorithm, which allows RHEEM to discover multiple hidden opportunities for cross-platform data processing.},
  language  = {en}
}
% Secondary publication (postprint) in the Universitaet Potsdam
% "Zweitveroeffentlichungen" series, number 6.
% The accent in the last author's surname uses the ASCII macro form so the
% entry compiles under classic BibTeX/LaTeX; "RHEEMix" is brace-protected
% against sentence-casing styles.
@misc{KruseKaoudiContrerasRojasetal.2020,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  series    = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t},
  number    = {6},
  doi       = {10.25932/publishup-51944},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-519443},
  pages     = {26},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this paper, we present the cost-based optimizer of Rheem, an open-source cross-platform system that copes with these new requirements. The optimizer allocates the subtasks of data analytic tasks to the most suitable platforms. Our main contributions are: (i) a mechanism based on graph transformations to explore alternative execution strategies; (ii) a novel graph-based approach to determine efficient data movement plans among subtasks and platforms; and (iii) an efficient plan enumeration algorithm, based on a novel enumeration algebra. We extensively evaluate our optimizer under diverse real tasks. We show that our optimizer can perform tasks more than one order of magnitude faster when using multiple platforms than when using a single platform.},
  language  = {en}
}
% Journal version in The VLDB Journal 29(6).
% The key carries an "a" suffix because the unsuffixed key is already used
% by the preceding postprint entry; a repeated key makes BibTeX/Biber drop
% this entry, so it was previously unreachable from any citation.
@article{KruseKaoudiContrerasRojasetal.2020a,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  journal   = {The VLDB Journal},
  volume    = {29},
  number    = {6},
  publisher = {Springer},
  address   = {Berlin},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-020-00612-x},
  pages     = {1287--1310},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this paper, we present the cost-based optimizer of Rheem, an open-source cross-platform system that copes with these new requirements. The optimizer allocates the subtasks of data analytic tasks to the most suitable platforms. Our main contributions are: (i) a mechanism based on graph transformations to explore alternative execution strategies; (ii) a novel graph-based approach to determine efficient data movement plans among subtasks and platforms; and (iii) an efficient plan enumeration algorithm, based on a novel enumeration algebra. We extensively evaluate our optimizer under diverse real tasks. We show that our optimizer can perform tasks more than one order of magnitude faster when using multiple platforms than when using a single platform.},
  language  = {en}
}
% Doctoral thesis by Sebastian Kruse (Potsdam, 2018).
% Fields are grouped as: who/what, where/when, extent and locator, then
% the free-text abstract last.
@phdthesis{Kruse2018,
  author   = {Kruse, Sebastian},
  title    = {Scalable data profiling},
  school   = {Universit{\"a}t Potsdam},
  year     = {2018},
  pages    = {ii, 156},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-412521},
  language = {en},
  abstract = {Data profiling is the act of extracting structural metadata from datasets. Structural metadata, such as data dependencies and statistics, can support data management operations, such as data integration and data cleaning. Data management often is the most time-consuming activity in any data-related project. Its support is extremely valuable in our data-driven world, so that more time can be spent on the actual utilization of the data, e. g., building analytical models. In most scenarios, however, structural metadata is not given and must be extracted first. Therefore, efficient data profiling methods are highly desirable. Data profiling is a computationally expensive problem; in fact, most dependency discovery problems entail search spaces that grow exponentially in the number of attributes. To this end, this thesis introduces novel discovery algorithms for various types of data dependencies - namely inclusion dependencies, conditional inclusion dependencies, partial functional dependencies, and partial unique column combinations - that considerably improve over state-of-the-art algorithms in terms of efficiency and that scale to datasets that cannot be processed by existing algorithms. The key to those improvements are not only algorithmic innovations, such as novel pruning rules or traversal strategies, but also algorithm designs tailored for distributed execution. While distributed data profiling has been mostly neglected by previous works, it is a logical consequence on the face of recent hardware trends and the computational hardness of dependency discovery. To demonstrate the utility of data profiling for data management, this thesis furthermore presents Metacrate, a database for structural metadata. Its salient features are its flexible data model, the capability to integrate various kinds of structural metadata, and its rich metadata analytics library. 
We show how to perform a data anamnesis of unknown, complex datasets based on this technology. In particular, we describe in detail how to reconstruct the schemata and assess their quality as part of the data anamnesis. The data profiling algorithms and Metacrate have been carefully implemented, integrated with the Metanome data profiling tool, and are available as free software. In that way, we intend to allow for easy repeatability of our research results and also provide them for actual usage in real-world data-related projects.}
}