% PhD theses (Universitaet Potsdam) related to the Hyrise research DBMS.
% Non-ASCII letters are written as LaTeX escapes (for example {\"a} and {\ss})
% so the entries sort and render correctly under classic 8-bit BibTeX.

% Dreseler (2022): tiering data between DRAM and NVM for in-memory databases.
@phdthesis{Dreseler2022,
  author   = {Dreseler, Markus},
  title    = {Automatic tiering for in-memory database systems},
  school   = {Universit{\"a}t Potsdam},
  year     = {2022},
  pages    = {vii, 143},
  doi      = {10.25932/publishup-55825},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-558253},
  language = {en},
  abstract = {A decade ago, it became feasible to store multi-terabyte databases in main memory. These in-memory databases (IMDBs) profit from DRAM's low latency and high throughput as well as from the removal of costly abstractions used in disk-based systems, such as the buffer cache. However, as the DRAM technology approaches physical limits, scaling these databases becomes difficult. Non-volatile memory (NVM) addresses this challenge. This new type of memory is persistent, has more capacity than DRAM (4x), and does not suffer from its density-inhibiting limitations. Yet, as NVM has a higher latency (5-15x) and a lower throughput (0.35x), it cannot fully replace DRAM. IMDBs thus need to navigate the trade-off between the two memory tiers. We present a solution to this optimization problem. Leveraging information about access frequencies and patterns, our solution utilizes NVM's additional capacity while minimizing the associated access costs. Unlike buffer cache-based implementations, our tiering abstraction does not add any costs when reading data from DRAM. As such, it can act as a drop-in replacement for existing IMDBs. Our contributions are as follows: (1) As the foundation for our research, we present Hyrise, an open-source, columnar IMDB that we re-engineered and re-wrote from scratch. Hyrise enables realistic end-to-end benchmarks of SQL workloads and offers query performance which is competitive with other research and commercial systems. At the same time, Hyrise is easy to understand and modify as repeatedly demonstrated by its uses in research and teaching. (2) We present a novel memory management framework for different memory and storage tiers. By encapsulating the allocation and access methods of these tiers, we enable existing data structures to be stored on different tiers with no modifications to their implementation. Besides DRAM and NVM, we also support and evaluate SSDs and have made provisions for upcoming technologies such as disaggregated memory. (3) To identify the parts of the data that can be moved to (s)lower tiers with little performance impact, we present a tracking method that identifies access skew both in the row and column dimensions and that detects patterns within consecutive accesses. Unlike existing methods that have substantial associated costs, our access counters exhibit no identifiable overhead in standard benchmarks despite their increased accuracy. (4) Finally, we introduce a tiering algorithm that optimizes the data placement for a given memory budget. In the TPC-H benchmark, this allows us to move 90\% of the data to NVM while the throughput is reduced by only 10.8\% and the query latency is increased by 11.6\%. With this, we outperform approaches that ignore the workload's access skew and access patterns and increase the query latency by 20\% or more. Individually, our contributions provide novel approaches to current challenges in systems engineering and database research. Combining them allows IMDBs to scale past the limits of DRAM while continuing to profit from the benefits of in-memory computing.},
}

% Kossmann (2023): automated index selection and dependency-based query optimization.
@phdthesis{Kossmann2023,
  author   = {Ko{\ss}mann, Jan},
  title    = {Unsupervised database optimization},
  school   = {Universit{\"a}t Potsdam},
  year     = {2023},
  pages    = {xi, 203},
  doi      = {10.25932/publishup-58949},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-589490},
  language = {en},
  abstract = {The amount of data stored in databases and the complexity of database workloads are ever-increasing. Database management systems (DBMSs) offer many configuration options, such as index creation or unique constraints, which must be adapted to the specific instance to efficiently process large volumes of data. Currently, such database optimization is complicated, manual work performed by highly skilled database administrators (DBAs). In cloud scenarios, manual database optimization even becomes infeasible: it exceeds the abilities of the best DBAs due to the enormous number of deployed DBMS instances (some providers maintain millions of instances), missing domain knowledge resulting from data privacy requirements, and the complexity of the configuration tasks. Therefore, we investigate how to automate the configuration of DBMSs efficiently with the help of unsupervised database optimization. While there are numerous configuration options, in this thesis, we focus on automatic index selection and the use of data dependencies, such as functional dependencies, for query optimization. Both aspects have an extensive performance impact and complement each other by approaching unsupervised database optimization from different perspectives. Our contributions are as follows: (1) we survey automated state-of-the-art index selection algorithms regarding various criteria, e.g., their support for index interaction. We contribute an extensible platform for evaluating the performance of such algorithms with industry-standard datasets and workloads. The platform is well-received by the community and has led to follow-up research. With our platform, we derive the strengths and weaknesses of the investigated algorithms. We conclude that existing solutions often have scalability issues and cannot quickly determine (near-)optimal solutions for large problem instances. (2) To overcome these limitations, we present two new algorithms. Extend determines (near-)optimal solutions with an iterative heuristic. It identifies the best index configurations for the evaluated benchmarks. Its selection runtimes are up to 10 times lower compared with other near-optimal approaches. SWIRL is based on reinforcement learning and delivers solutions instantly. These solutions perform within 3 \% of the optimal ones. Extend and SWIRL are available as open-source implementations. (3) Our index selection efforts are complemented by a mechanism that analyzes workloads to determine data dependencies for query optimization in an unsupervised fashion. We describe and classify 58 query optimization techniques based on functional, order, and inclusion dependencies as well as on unique column combinations. The unsupervised mechanism and three optimization techniques are implemented in our open-source research DBMS Hyrise. Our approach reduces the Join Order Benchmark's runtime by 26 \% and accelerates some TPC-DS queries by up to 58 times. Additionally, we have developed a cockpit for unsupervised database optimization that allows interactive experiments to build confidence in such automated techniques. In summary, our contributions improve the performance of DBMSs, support DBAs in their work, and enable them to contribute their time to other, less arduous tasks.},
}