@article{JiangNaumann2020,
  author    = {Jiang, Lan and Naumann, Felix},
  title     = {Holistic primary key and foreign key detection},
  journal   = {Journal of Intelligent Information Systems (JIIS)},
  volume    = {54},
  number    = {3},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {0925-9902},
  doi       = {10.1007/s10844-019-00562-z},
  pages     = {439--461},
  year      = {2020},
  abstract  = {Primary keys (PKs) and foreign keys (FKs) are important elements of relational schemata in various applications, such as query optimization and data integration. However, in many cases, these constraints are unknown or not documented. Detecting them manually is time-consuming and even infeasible in large-scale datasets. We study the problem of discovering primary keys and foreign keys automatically and propose an algorithm to detect both, namely Holistic Primary Key and Foreign Key Detection (HoPF). PKs and FKs are subsets of the sets of unique column combinations (UCCs) and inclusion dependencies (INDs), respectively, for which efficient discovery algorithms are known. Using score functions, our approach is able to effectively extract the true PKs and FKs from the vast sets of valid UCCs and INDs. Several pruning rules are employed to speed up the procedure. We evaluate precision and recall on three benchmarks and two real-world datasets. The results show that our method is able to retrieve on average 88\% of all primary keys and 91\% of all foreign keys. We compare the performance of HoPF with two baseline approaches that both assume the existence of primary keys.},
  language  = {en}
}

@article{VitaglianoHameedJiangetal.2023,
  author    = {Vitagliano, Gerardo and Hameed, Mazhar and Jiang, Lan and Reisener, Lucas and Wu, Eugene and Naumann, Felix},
  title     = {Pollock: a data loading benchmark},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {16},
  number    = {8},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3594512.3594518},
  pages     = {1870--1882},
  year      = {2023},
  abstract  = {Any system at play in a data-driven project has a fundamental requirement: the ability to load data. The de facto standard format to distribute and consume raw data is CSV. Yet, the plain-text and flexible nature of this format often makes such files difficult to parse and their content difficult to load correctly, requiring cumbersome data preparation steps. We propose a benchmark to assess the robustness of systems in loading data from non-standard CSV formats and with structural inconsistencies. First, we formalize a model to describe the issues that affect real-world files and use it to derive a systematic ``pollution'' process to generate dialects for any given grammar. Our benchmark leverages the pollution framework for the CSV format. To guide pollution, we have surveyed thousands of real-world, publicly available CSV files, recording the problems we encountered. We demonstrate the applicability of our benchmark by testing and scoring 16 different systems: popular CSV parsing frameworks, relational database tools, spreadsheet systems, and a data visualization tool.},
  language  = {en}
}