% Potsdam technical report. The record originally carried two publisher
% fields; the university press stays in publisher and the second value is
% preserved in the (non-standard, ignored by styles) institution field.
@book{DraisbachNaumannSzottetal.2012,
  author      = {Draisbach, Uwe and Naumann, Felix and Szott, Sascha and Wonneberg, Oliver},
  title       = {Adaptive windows for duplicate detection},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  institution = {Universit{\"a}t Potsdam},
  isbn        = {978-3-86956-143-1},
  issn        = {1613-5652},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53007},
  pages       = {41},
  year        = {2012},
  abstract    = {Duplicate detection is the task of identifying all groups of records within a data set that represent the same real-world entity, respectively. This task is difficult, because (i) representations might differ slightly, so some similarity measure must be defined to compare pairs of records and (ii) data sets might have a high volume making a pair-wise comparison of all records infeasible. To tackle the second problem, many algorithms have been suggested that partition the data set and compare all record pairs only within each partition. One well-known such approach is the Sorted Neighborhood Method (SNM), which sorts the data according to some key and then advances a window over the data comparing only records that appear within the same window. We propose several variations of SNM that have in common a varying window size and advancement. The general intuition of such adaptive windows is that there might be regions of high similarity suggesting a larger window size and regions of lower similarity suggesting a smaller window size. We propose and thoroughly evaluate several adaption strategies, some of which are provably better than the original SNM in terms of efficiency (same results with fewer comparisons).},
  language    = {en}
}

% Same series and same publisher/institution split as the record above.
@book{AbedjanNaumann2011,
  author      = {Abedjan, Ziawasch and Naumann, Felix},
  title       = {Advancing the discovery of unique column combinations},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  institution = {Universit{\"a}t Potsdam},
  isbn        = {978-3-86956-148-6},
  issn        = {1613-5652},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564},
  pages       = {25},
  year        = {2011},
  abstract    = {Unique column combinations of a relational database table are sets of columns that contain only unique values. Discovering such combinations is a fundamental research problem and has many different data management and knowledge discovery applications. Existing discovery algorithms are either brute force or have a high memory load and can thus be applied only to small datasets or samples. In this paper, the well-known GORDIAN algorithm and ``Apriori-based'' algorithms are compared and analyzed for further optimization. We greatly improve the Apriori algorithms through efficient candidate generation and statistics-based pruning methods. A hybrid solution HCAGORDIAN combines the advantages of GORDIAN and our new algorithm HCA, and it significantly outperforms all previous work in many situations.},
  language    = {en}
}

% H.E.S.S. collaboration paper. Compound surnames are written in
% "Last, First" form so that no part of the surname is parsed as a given name.
@article{AbramowskiAceroAharonianetal.2013,
  author = {Abramowski, Attila and Acero, F. and Aharonian, Felix A.
    and Ait Benkhali, Faical and Akhperjanian, A. G.
    and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and Anton, Gisela
    and Balenderan, Shangkari and Balzer, Arnim and Barnacka, Anna
    and Becherini, Yvonne and Becker Tjus, J. and Bernl{\"o}hr, K.
    and Birsin, E. and Bissaldi, E. and Biteau, Jonathan
    and Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J.
    and Brun, Francois and Brun, Pierre and Bulik, Tomasz
    and Carrigan, Svenja and Casanova, Sabrina and Cerruti, M.
and Chadwick, Paula M. and Chalme-Calvet, R. and Chaves, Ryan C. G.
    and Cheesebrough, A. and Chretien, M. and Colafrancesco, Sergio
    and Cologna, Gabriele and Conrad, Jan and Couturier, C. and Dalton, M.
    and Daniel, M. K. and Davids, I. D. and Degrange, B. and Deil, C.
    and deWilt, P. and Dickinson, H. J. and Djannati-Ata{\"i}, A.
    and Domainko, W. and Drury, L. O'C. and Dubus, G. and Dutson, K.
    and Dyks, J. and Dyrda, M. and Edwards, T. and Egberts, Kathrin
    and Eger, P. and Espigat, P. and Farnier, C. and Fegan, S.
    and Feinstein, F. and Fernandes, M. V. and Fernandez, D.
    and Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M.
    and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Gast, H.
    and Giebels, B. and Glicenstein, J. F. and Goering, D.
    and Grondin, M.-H. and Grudzinska, M. and Haeffner, S.
    and Hague, J. D. and Hahn, J. and Harris, J. and Heinzelmann, G.
    and Henri, G. and Hermann, G. and Hervet, O. and Hillert, A.
    and Hinton, James Anthony and Hofmann, W. and Hofverberg, P.
    and Holler, Markus and Horns, D. and Jacholkowska, A. and Jahn, C.
    and Jamrozy, M. and Janiak, M. and Jankowsky, F. and Jung, I.
    and Kastendieck, M. A. and Katarzynski, K. and Katz, U.
    and Kaufmann, S. and Khelifi, B. and Kieffer, M. and Klepser, S.
    and Klochkov, D. and Kluzniak, W. and Kneiske, T. and Kolitzus, D.
    and Komin, Nu. and Kosack, K. and Krakau, S. and Krayzel, F.
    and Krueger, P. P. and Laffon, H. and Lamanna, G. and Lefaucheur, J.
    and Lemoine-Goumard, M. and Lenain, J.-P. and Lennarz, D.
    and Lohse, T. and Lopatin, A. and Lu, C.-C. and Marandon, V.
    and Marcowith, Alexandre and Marx, R. and Maurin, G. and Maxted, N.
    and Mayer, M. and McComb, T. J. L. and Medina, M. C. and Mehault, J.
    and Menzler, U. and Meyer, M. and Moderski, R. and Mohamed, M.
    and Moulin, Emmanuel and Murach, T. and Naumann, C. L.
    and de Naurois, M. and Nedbal, D. and Niemiec, J. and Nolan, S. J.
    and Oakes, L. and Ohm, S. and de Ona Wilhelmi, E. and Opitz, B.
    and Ostrowski, M. and Oya, I.
    and Panter, M. and Parsons, R. D. and Paz Arribas, M.
    and Pekeur, N. W. and Pelletier, G. and Perez, J.
    and Petrucci, P.-O. and Peyaud, B. and Pita, S. and Poon, H.
    and Puehlhofer, G. and Punch, M. and Quirrenbach, A. and Raab, S.
    and Raue, M. and Reimer, A. and Reimer, O. and Renaud, M.
    and de los Reyes, R. and Rieger, F. and Rob, L. and Rosier-Lees, S.
    and Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V.
    and Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R.
    and Schuessler, F. and Schulz, A. and Schwanke, U.
    and Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G.
    and Spiess, F. and Stawarz, L. and Steenkamp, R.
    and Stegmann, Christian and Stinzing, F. and Stycz, K.
    and Sushch, Iurii and Szostek, A. and Tavernet, J.-P.
    and Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K.
    and van Eldik, C. and Vasileiadis, G. and Venter, C. and Viana, A.
    and Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M.
    and Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M.
    and Weitzel, Q. and White, R. and Wierzcholska, A. and Willmann, P.
    and Woernlein, A. and Wouters, D. and Zacharias, M. and Zajczyk, A.
    and Zdziarski, A. A. and Zech, Alraune and Zechlin, H.-S.},
  title        = {Constraints on axionlike particles with {HESS} from the irregularity of the {PKS} 2155-304 energy spectrum},
  journal      = {Physical Review D},
  volume       = {88},
  number       = {10},
  publisher    = {American Physical Society},
  address      = {College Park},
  organization = {HESS Collaboration},
  issn         = {1550-7998},
  doi          = {10.1103/PhysRevD.88.102003},
  pages        = {12},
  year         = {2013},
  abstract     = {Axionlike particles (ALPs) are hypothetical light (sub-eV) bosons predicted in some extensions of the Standard Model of particle physics. In astrophysical environments comprising high-energy gamma rays and turbulent magnetic fields, the existence of ALPs can modify the energy spectrum of the gamma rays for a sufficiently large coupling between ALPs and photons. This modification would take the form of an irregular behavior of the energy spectrum in a limited energy range. Data from the H. E. S. S. observations of the distant BL Lac object PKS 2155-304 ($z = 0.116$) are used to derive upper limits at the 95\% C. L. on the strength of the ALP coupling to photons, $g_{\gamma a} < 2.1 \times 10^{-11}$ GeV$^{-1}$ for an ALP mass between 15 and 60 neV. The results depend on assumptions on the magnetic field around the source, which are chosen conservatively. The derived constraints apply to both light pseudoscalar and scalar bosons that couple to the electromagnetic field.},
  language     = {en}
}

% Potsdam technical report. The university press stays in publisher; the
% second publisher value of the original record is preserved in the
% (non-standard, ignored by styles) institution field.
@book{BauckmannAbedjanLeseretal.2012,
  author      = {Bauckmann, Jana and Abedjan, Ziawasch and Leser, Ulf and M{\"u}ller, Heiko and Naumann, Felix},
  title       = {Covering or complete? {Discovering} conditional inclusion dependencies},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  institution = {Universit{\"a}t Potsdam},
  isbn        = {978-3-86956-212-4},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-62089},
  pages       = {34},
  year        = {2012},
  abstract    = {Data dependencies, or integrity constraints, are used to improve the quality of a database schema, to optimize queries, and to ensure consistency in a database. In the last years conditional dependencies have been introduced to analyze and improve data quality. In short, a conditional dependency is a dependency with a limited scope defined by conditions over one or more attributes. Only the matching part of the instance must adhere to the dependency. In this paper we focus on conditional inclusion dependencies (CINDs). We generalize the definition of CINDs, distinguishing covering and completeness conditions.
We present a new use case for such CINDs showing their value for solving complex data quality tasks. Further, we define quality measures for conditions inspired by precision and recall. We propose efficient algorithms that identify covering and completeness conditions conforming to given quality thresholds. Our algorithms choose not only the condition values but also the condition attributes automatically. Finally, we show that our approach efficiently provides meaningful and helpful results for our use case.},
  language = {en}
}

% Conference paper: the proceedings title belongs in booktitle, not in
% journal/series, and address is the publisher's city.
@inproceedings{LosterNaumannEhmuelleretal.2018,
  author    = {Loster, Michael and Naumann, Felix and Ehmueller, Jan and Feldmann, Benjamin},
  title     = {{CurEx}},
  booktitle = {Proceedings of the 27th {ACM} International Conference on Information and Knowledge Management},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  isbn      = {978-1-4503-6014-2},
  doi       = {10.1145/3269206.3269229},
  pages     = {1883--1886},
  year      = {2018},
  abstract  = {The integration of diverse structured and unstructured information sources into a unified, domain-specific knowledge base is an important task in many areas. A well-maintained knowledge base enables data analysis in complex scenarios, such as risk analysis in the financial sector or investigating large data leaks, such as the Paradise or Panama papers. Both the creation of such knowledge bases, as well as their continuous maintenance and curation involves many complex tasks and considerable manual effort. With CurEx, we present a modular system that allows structured and unstructured data sources to be integrated into a domain-specific knowledge base.
In particular, we (i) enable the incremental improvement of each individual integration component; (ii) enable the selective generation of multiple knowledge graphs from the information contained in the knowledge base; and (iii) provide two distinct user interfaces tailored to the needs of data engineers and end-users respectively. The former has curation capabilities and controls the integration process, whereas the latter focuses on the exploration of the generated knowledge graph.},
  language = {en}
}

% Journal article: the journal name is stored once, in journal, without the
% catalogue subtitle; the sharp s is written as a LaTeX escape like the
% other accented characters in this file.
@article{KossmannPapenbrockNaumann2021,
  author    = {Ko{\ss}mann, Jan and Papenbrock, Thorsten and Naumann, Felix},
  title     = {Data dependencies for query optimization},
  journal   = {The {VLDB} Journal},
  volume    = {31},
  number    = {1},
  publisher = {Springer},
  address   = {Berlin ; Heidelberg ; New York},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-021-00676-3},
  pages     = {1--22},
  year      = {2021},
  abstract  = {Effective query optimization is a core feature of any database management system. While most query optimization techniques make use of simple metadata, such as cardinalities and other basic statistics, other optimization techniques are based on more advanced metadata including data dependencies, such as functional, uniqueness, order, or inclusion dependencies. This survey provides an overview, intuitive descriptions, and classifications of query optimization and execution strategies that are enabled by data dependencies. We consider the most popular types of data dependencies and focus on optimization strategies that target the optimization of relational database queries.
The survey supports database vendors to identify optimization opportunities as well as DBMS researchers to find related work and open research questions.},
  language = {en}
}

% Journal article: the journal name is stored once, in journal.
@article{HameedNaumann2020,
  author    = {Hameed, Mazhar and Naumann, Felix},
  title     = {Data Preparation},
  journal   = {{SIGMOD} Record},
  volume    = {49},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0163-5808},
  doi       = {10.1145/3444831.3444835},
  pages     = {18--29},
  year      = {2020},
  abstract  = {Raw data are often messy: they follow different encodings, records are not well structured, values do not adhere to patterns, etc. Such data are in general not fit to be ingested by downstream applications, such as data analytics tools, or even by data management systems. The act of obtaining information from raw data relies on some data preparation process. Data preparation is integral to advanced data analysis and data management, not only for data science but for any data-driven applications. Existing data preparation tools are operational and useful, but there is still room for improvement and optimization. With increasing data volume and its messy nature, the demand for prepared data increases day by day.
To cater to this demand, companies and researchers are developing techniques and tools for data preparation. To better understand the available data preparation systems, we have conducted a survey to investigate (1) prominent data preparation tools, (2) distinctive tool features, (3) the need for preliminary data processing even for these tools and, (4) features and abilities that are still lacking. We conclude with an argument in support of automatic and intelligent data preparation beyond traditional and simplistic techniques.},
  language = {en}
}

% Journal article: the journal name is stored once, in journal, without the
% catalogue suffix. The pages field holds a single number, as in the record.
@article{KoumarelasJiangNaumann2020,
  author    = {Koumarelas, Ioannis and Jiang, Lan and Naumann, Felix},
  title     = {Data preparation for duplicate detection},
  journal   = {Journal of Data and Information Quality},
  volume    = {12},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3377878},
  pages     = {24},
  year      = {2020},
  abstract  = {Data errors represent a major issue in most application workflows. Before any important task can take place, a certain data quality has to be guaranteed by eliminating a number of different errors that may appear in data. Typically, most of these errors are fixed with data preparation methods, such as whitespace removal. However, the particular error of duplicate records, where multiple records refer to the same entity, is usually eliminated independently with specialized techniques. Our work is the first to bring these two areas together by applying data preparation operations under a systematic approach prior to performing duplicate detection.
Our process workflow can be summarized as follows: It begins with the user providing as input a sample of the gold standard, the actual dataset, and optionally some constraints to domain-specific data preparations, such as address normalization. The preparation selection operates in two consecutive phases. First, to vastly reduce the search space of ineffective data preparations, decisions are made based on the improvement or worsening of pair similarities. Second, using the remaining data preparations an iterative leave-one-out classification process removes preparations one by one and determines the redundant preparations based on the achieved area under the precision-recall curve (AUC-PR). Using this workflow, we manage to improve the results of duplicate detection up to 19\% in AUC-PR.},
  language = {en}
}

% Book in a numbered series: series name and series number are separate fields.
% NOTE(review): the required year field is missing from this record (the
% citation key also has no year suffix); confirm the publication year and add it.
@book{AbedjanGolabNaumannetal.,
  author    = {Abedjan, Ziawasch and Golab, Lukasz and Naumann, Felix and Papenbrock, Thorsten},
  title     = {Data Profiling},
  series    = {Synthesis Lectures on Data Management},
  number    = {52},
  publisher = {Morgan \& Claypool Publishers},
  address   = {San Rafael},
  isbn      = {978-1-68173-446-0},
  pages     = {xviii, 136},
  language  = {en}
}

% CTA consortium paper. A compound surname containing a lowercase particle
% is braced so it is kept together as one family name.
@article{ActisAgnettaAharonianetal.2011,
  author = {Actis, M. and Agnetta, G. and Aharonian, Felix A.
    and Akhperjanian, A. G. and Aleksic, J. and Aliu, E. and Allan, D.
    and Allekotte, I. and Antico, F. and Antonelli, L. A. and Antoranz, P.
    and Aravantinos, A. and Arlen, T. and Arnaldi, H. and Artmann, S.
    and Asano, K. and Asorey, H. G. and Baehr, J. and Bais, A.
    and Baixeras, C. and Bajtlik, S. and Balis, D. and Bamba, A.
    and Barbier, C. and Barcelo, M. and Barnacka, Anna
    and Barnstedt, J{\"u}rgen and {Barres de Almeida}, U.
    and Barrio, J. A. and Basso, S. and Bastieri, D. and Bauer, C.
    and Becerra Gonzalez, J. and Becherini, Yvonne and Bechtol, K. C.
    and Becker, J. and Beckmann, Volker and Bednarek, W. and Behera, B.
    and Beilicke, M. and Belluso, M. and Benallou, M.
and Benbow, W. and Berdugo, J. and Berger, K. and Bernardino, T.
    and Bernl{\"o}hr, K. and Biland, A. and Billotta, S. and Bird, T.
    and Birsin, E. and Bissaldi, E. and Blake, S. and Blanch Bigas, O.
    and Bobkov, A. A. and Bogacz, L. and Bogdan, M.
    and Boisson, Catherine and Boix Gargallo, J. and Bolmont, J.
    and Bonanno, G. and Bonardi, A. and Bonev, T. and Borkowski, Janett
    and Botner, O. and Bottani, A. and Bourgeat, M. and Boutonnet, C.
    and Bouvier, A. and Brau-Nogue, S. and Braun, I. and Bretz, T.
    and Briggs, M. S. and Brun, Pierre and Brunetti, L. and Buckley, H.
    and Bugaev, V. and Buehler, R. and Bulik, Tomasz and Busetto, G.
    and Buson, S. and Byrum, K. and Cailles, M. and Cameron, R. A.
    and Canestrari, R. and Cantu, S. and Carmona, E. and Carosi, A.
    and Carr, John and Carton, P. H. and Casiraghi, M. and Castarede, H.
    and Catalano, O. and Cavazzani, S. and Cazaux, S. and Cerruti, B.
    and Cerruti, M. and Chadwick, M. and Chiang, J. and Chikawa, M.
    and Cieslar, M. and Ciesielska, M. and Cillis, A. N. and Clerc, C.
    and Colin, P. and Colome, J. and Compin, M. and Conconi, P.
    and Connaughton, V. and Conrad, Jan and Contreras, J. L.
    and Coppi, P. and Corlier, M. and Corona, P. and Corpace, O.
    and Corti, D. and Cortina, J. and Costantini, H. and Cotter, G.
    and Courty, B. and Couturier, S. and Covino, S. and Croston, J.
    and Cusumano, G. and Daniel, M. K. and Dazzi, F. and Deangelis, A.
    and de Cea del Pozo, E. and de Gouveia Dal Pino, E. M.
    and de Jager, O. and de la Calle Perez, I. and De La Vega, G.
    and De Lotto, B. and de Naurois, M. and de Ona Wilhelmi, E.
    and de Souza, V. and Decerprit, B. and Deil, C. and Delagnes, E.
    and Deleglise, G. and Delgado, C. and Dettlaff, T. and Di Paolo, A.
    and Di Pierro, F. and Diaz, C. and Dick, J. and Dickinson, H.
    and Digel, S. W. and Dimitrov, D. and Disset, G.
    and Djannati-Ata{\"i}, A. and Doert, M. and Domainko, W.
    and Dorner, D. and Doro, M. and Dournaux, J.-L. and Dravins, D.
    and Drury, L. and Dubois, F. and Dubois, R.
    and Dubus, G. and Dufour, C. and Durand, D. and Dyks, J.
    and Dyrda, M. and Edy, E. and Egberts, Kathrin
    and Eleftheriadis, C. and Elles, S. and Emmanoulopoulos, D.
    and Enomoto, R. and Ernenwein, J.-P. and Errando, M.
    and Etchegoyen, A. and Falcone, A. D. and Farakos, K.
    and Farnier, C. and Federici, S. and Feinstein, F. and Ferenc, D.
    and Fillin-Martino, E. and Fink, D. and Finley, C.
    and Finley, J. P. and Firpo, R. and Florin, D. and Foehr, C.
    and Fokitis, E. and Font, Ll. and Fontaine, G. and Fontana, A.
    and Foerster, A. and Fortson, L. and Fouque, N. and Fransson, C.
    and Fraser, G. W. and Fresnillo, L. and Fruck, C. and Fujita, Y.
    and Fukazawa, Y. and Funk, S. and Gaebele, W. and Gabici, S.
    and Gadola, A. and Galante, N. and Gallant, Y. and Garcia, B.
    and Garcia Lopez, R. J. and Garrido, D. and Garrido, L.
    and Gascon, D. and Gasq, C. and Gaug, M. and Gaweda, J.
    and Geffroy, N. and Ghag, C. and Ghedina, A. and Ghigo, M.
    and Gianakaki, E. and Giarrusso, S. and Giavitto, G.
    and Giebels, B. and Giro, E. and Giubilato, P. and Glanzman, T.
    and Glicenstein, J.-F. and Gochna, M. and Golev, V.
    and Gomez Berisso, M. and Gonzalez, A. and Gonzalez, F.
    and Granena, F. and Graciani, R. and Granot, J. and Gredig, R.
    and Green, A. and Greenshaw, T. and Grimm, O. and Grube, J.
    and Grudzinska, M. and Grygorczuk, J. and Guarino, V.
    and Guglielmi, L. and Guilloux, F. and Gunji, S. and Gyuk, G.
    and Hadasch, D. and Haefner, D. and Hagiwara, R. and Hahn, J.
    and Hallgren, A. and Hara, S. and Hardcastle, M. J. and Hassan, T.
    and Haubold, T. and Hauser, M. and Hayashida, M. and Heller, R.
    and Henri, G. and Hermann, G. and Herrero, A.
    and Hinton, James Anthony and Hoffmann, D. and Hofmann, W.
    and Hofverberg, P. and Horns, D. and Hrupec, D. and Huan, H.
    and Huber, B. and Huet, J.-M. and Hughes, G. and Hultquist, K.
    and Humensky, T. B. and Huppert, J.-F. and Ibarra, A.
    and Illa, J. M. and Ingjald, J. and Inoue, S. and Inoue, Y.
    and Ioka, K. and Jablonski, C. and Jacholkowska, A.
    and Janiak, M. and Jean, P. and Jensen, H. and Jogler, T.
    and Jung, I. and Kaaret, P. and Kabuki, S. and Kakuwa, J.
    and Kalkuhl, C. and Kankanyan, R. and Kapala, M.
    and Karastergiou, A. and Karczewski, M. and Karkar, S.
    and Karlsson, N. and Kasperek, J. and Katagiri, H.
    and Katarzynski, K. and Kawanaka, N. and Kedziora, B.
    and Kendziorra, E. and Khelifi, B. and Kieda, D. and Kifune, T.
    and Kihm, T. and Klepser, S. and Kluzniak, W. and Knapp, J.
    and Knappy, A. R. and Kneiske, T. and Knoedlseder, J. and Koeck, F.
    and Kodani, K. and Kohri, K. and Kokkotas, K. and Komin, N.
    and Konopelko, A. and Kosack, K. and Kossakowski, R. and Kostka, P.
    and Kotula, J. and Kowal, G. and Koziol, J. and Kraehenbuehl, T.
    and Krause, J. and Krawczynski, H. and Krennrich, F.
    and Kretzschmann, A. and Kubo, H. and Kudryavtsev, V. A.
    and Kushida, J. and La Barbera, N. and La Parola, V.
    and La Rosa, G. and Lopez, A. and Lamanna, G. and Laporte, P.
    and Lavalley, C. and Le Flour, T. and Le Padellec, A.
    and Lenain, J.-P. and Lessio, L. and Lieunard, B. and Lindfors, E.
    and Liolios, A. and Lohse, T. and Lombardi, S. and Lopatin, A.
    and Lorenz, E. and Lubinski, P. and Luz, O. and Lyard, E.
    and Maccarone, M. C. and Maccarone, T. and Maier, G.
    and Majumdar, P. and Maltezos, S. and Malkiewicz, P. and Mana, C.
    and Manalaysay, A. and Maneva, G. and Mangano, A. and Manigot, P.
    and Marin, J. and Mariotti, M. and Markoff, S. and Martinez, G.
    and Martinez, M. and Mastichiadis, A. and Matsumoto, H.
    and Mattiazzo, S. and Mazin, D. and McComb, T. J. L.
    and McCubbin, N. and McHardy, I. and Medina, C. and Melkumyan, D.
    and Mendes, A. and Mertsch, P. and Meucci, M. and Michalowski, J.
    and Micolon, P. and Mineo, T. and Mirabal, N. and Mirabel, F.
    and Miranda, J. M. and Mirzoyan, R. and Mizuno, T. and Moal, B.
    and Moderski, R. and Molinari, E. and Monteiro, I.
    and Moralejo, A. and Morello, C. and Mori, K. and Motta, G.
    and Mottez, F. and Moulin, Emmanuel and Mukherjee, R.
    and Munar, P. and Muraishi, H.
    and Murase, K. and Murphy, A. Stj. and Nagataki, S. and Naito, T.
    and Nakamori, T. and Nakayama, K. and Naumann, C. L.
    and Naumann, D. and Nayman, P. and Nedbal, D. and Niedzwiecki, A.
    and Niemiec, J. and Nikolaidis, A. and Nishijima, K.
    and Nolan, S. J. and Nowak, N. and O'Brien, P. T. and Ochoa, I.
    and Ohira, Y. and Ohishi, M. and Ohka, H. and Okumura, A.
    and Olivetto, C. and Ong, R. A. and Orito, R. and Orr, M.
    and Osborne, J. P. and Ostrowski, M. and Otero, L.
    and Otte, A. N. and Ovcharov, E. and Oya, I. and Ozieblo, A.
    and Paiano, S. and Pallota, J. and Panazol, J. L. and Paneque, D.
    and Panter, M. and Paoletti, R. and Papyan, G. and Paredes, J. M.
    and Pareschi, G. and Parsons, R. D. and Paz Arribas, M.
    and Pedaletti, G. and Pepato, A. and Persic, M.
    and Petrucci, P. O. and Peyaud, B. and Piechocki, W. and Pita, S.
    and Pivato, G. and Platos, L. and Platzer, R. and Pogosyan, L.
    and Pohl, Martin and Pojmanski, G. and Ponz, J. D. and Potter, W.
    and Prandini, E. and Preece, R. and Prokoph, H.
    and Puehlhofer, G. and Punch, M. and Quel, E. and Quirrenbach, A.
    and Rajda, P. and Rando, R. and Rataj, M. and Raue, M.
    and Reimann, C. and Reimann, O. and Reimer, A. and Reimer, O.
    and Renaud, M. and Renner, S. and Reymond, J.-M. and Rhode, W.
    and Ribo, M. and Ribordy, M. and Rico, J. and Rieger, F.
    and Ringegni, P. and Ripken, J. and Ristori, P. and Rivoire, S.
    and Rob, L. and Rodriguez, S. and Roeser, U.
    and Romano, Patrizia and Romero, G. E. and Rosier-Lees, S.
    and Rovero, A. C. and Roy, F. and Royer, S. and Rudak, B.
    and Rulten, C. B. and Ruppel, J. and Russo, F. and Ryde, F.
    and Sacco, B. and Saggion, A. and Sahakian, V. and Saito, K.
    and Saito, T. and Sakaki, N. and Salazar, E. and Salini, A.
    and Sanchez, F. and Sanchez Conde, M. A. and Santangelo, Andrea
    and Santos, E. M. and Sanuy, A. and Sapozhnikov, L.
    and Sarkar, S. and Scalzotto, V. and Scapin, V.
    and Scarcioffolo, M. and Schanz, T. and Schlenstedt, S.
    and Schlickeiser, R. and Schmidt, T. and Schmoll, J.
    and Schroedter, M. and Schultz, C. and Schultze, J. and Schulz, A.
    and Schwanke, U. and Schwarzburg, S. and Schweizer, T.
    and Seiradakis, J. and Selmane, S. and Seweryn, K. and Shayduk, M.
    and Shellard, R. C. and Shibata, T. and Sikora, M. and Silk, J.
    and Sillanpaa, A. and Sitarek, J. and Skole, C. and Smith, N.
    and Sobczynska, D. and Sofo Haro, M. and Sol, H. and Spanier, F.
    and Spiga, D. and Spyrou, S. and Stamatescu, V. and Stamerra, A.
    and Starling, R. L. C. and Stawarz, L. and Steenkamp, R.
    and Stegmann, Christian and Steiner, S. and Stergioulas, N.
    and Sternberger, R. and Stinzing, F. and Stodulski, M.
    and Straumann, U. and Suarez, A. and Suchenek, M.
    and Sugawara, R. and Sulanke, K. H. and Sun, S.
    and Supanitsky, A. D. and Sutcliffe, P. and Szanecki, M.
    and Szepieniec, T. and Szostek, A. and Szymkowiak, A.
    and Tagliaferri, G. and Tajima, H. and Takahashi, H.
    and Takahashi, K. and Takalo, L. and Takami, H. and Talbot, R. G.
    and Tam, P. H. and Tanaka, M. and Tanimori, T. and Tavani, M.
    and Tavernet, J.-P. and Tchernin, C. and Tejedor, L. A.
    and Telezhinsky, Igor O. and Temnikov, P. and Tenzer, C.
    and Terada, Y. and Terrier, R. and Teshima, M. and Testa, V.
    and Tibaldo, L. and Tibolla, O. and Tluczykont, M.
    and Todero Peixoto, C. J. and Tokanai, F. and Tokarz, M.
    and Toma, K. and Torres, D. F. and Tosti, G. and Totani, T.
    and Toussenel, F. and Vallania, P. and Vallejo, G.
    and van der Walt, J. and van Eldik, C. and Vandenbroucke, J.
    and Vankov, H. and Vasileiadis, G. and Vassiliev, V. V.
    and Vegas, I. and Venter, L. and Vercellone, S. and Veyssiere, C.
    and Vialle, J. P. and Videla, M. and Vincent, P. and Vink, J.
    and Vlahakis, N. and Vlahos, L. and Vogler, P. and Vollhardt, A.
    and Volpe, F. and Von Gunten, H. P. and Vorobiov, S.
    and Wagner, S. and Wagner, R. M. and Wagner, B. and Wakely, S. P.
    and Walter, P. and Walter, R. and Warwick, R. and Wawer, P.
    and Wawrzaszek, R. and Webb, N. and Wegner, P. and Weinstein, A.
    and Weitzel, Q. and Welsing, R.
and Wetteskind, H. and White, R. and Wierzcholska, A.
    and Wilkinson, M. I. and Williams, D. A. and Winde, M.
    and Wischnewski, R. and Wisniewski, L. and Wolczko, A. and Wood, M.
    and Xiong, Q. and Yamamoto, T. and Yamaoka, K. and Yamazaki, R.
    and Yanagita, S. and Yoffo, B. and Yonetani, M. and Yoshida, A.
    and Yoshida, T. and Yoshikoshi, T. and Zabalza, V.
    and Zagdanski, A. and Zajczyk, A. and Zdziarski, A.
    and Zech, Alraune and Zietara, K. and Ziolkowski, P.
    and Zitelli, V. and Zychowski, P.},
  title        = {Design concepts for the {Cherenkov Telescope Array} {CTA}: an advanced facility for ground-based high-energy gamma-ray astronomy},
  journal      = {Experimental Astronomy},
  volume       = {32},
  number       = {3},
  publisher    = {Springer},
  address      = {Dordrecht},
  organization = {CTA Consortium},
  issn         = {0922-6435},
  doi          = {10.1007/s10686-011-9247-0},
  pages        = {193--316},
  year         = {2011},
  abstract     = {Ground-based gamma-ray astronomy has had a major breakthrough with the impressive results obtained using systems of imaging atmospheric Cherenkov telescopes. Ground-based gamma-ray astronomy has a huge potential in astrophysics, particle physics and cosmology. CTA is an international initiative to build the next generation instrument, with a factor of 5-10 improvement in sensitivity in the 100 GeV-10 TeV range and the extension to energies well below 100 GeV and above 100 TeV. CTA will consist of two arrays (one in the north, one in the south) for full sky coverage and will be operated as open observatory. The design of CTA is based on currently available technology. This document reports on the status and presents the major design concepts of CTA.},
  language     = {en}
}

% Journal article: the journal name is stored once, in journal.
@article{VitaglianoJiangNaumann2021,
  author    = {Vitagliano, Gerardo and Jiang, Lan and Naumann, Felix},
  title     = {Detecting layout templates in complex multiregion files},
  journal   = {Proceedings of the {VLDB} Endowment},
  volume    = {15},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3494124.3494145},
  pages     = {646--658},
  year      = {2021},
  abstract  = {Spreadsheets are among the most commonly used file formats for data management, distribution, and analysis. Their widespread employment makes it easy to gather large collections of data, but their flexible canvas-based structure makes automated analysis difficult without heavy preparation. One of the common problems that practitioners face is the presence of multiple, independent regions in a single spreadsheet, possibly separated by repeated empty cells. We define such files as ``multiregion'' files. In collections of various spreadsheets, we can observe that some share the same layout. We present the Mondrian approach to automatically identify layout templates across multiple files and systematically extract the corresponding regions. Our approach is composed of three phases: first, each file is rendered as an image and inspected for elements that could form regions; then, using a clustering algorithm, the identified elements are grouped to form regions; finally, every file layout is represented as a graph and compared with others to find layout templates. We compare our method to state-of-the-art table recognition algorithms on two corpora of real-world enterprise spreadsheets.
Our approach shows the best performances in detecting reliable region boundaries within each file and can correctly identify recurring layouts across files.},
  language = {en}
}

% Journal article: the journal name is stored once, in journal.
@article{CaruccioDeufemiaNaumannetal.2021,
  author    = {Caruccio, Loredana and Deufemia, Vincenzo and Naumann, Felix and Polese, Giuseppe},
  title     = {Discovering relaxed functional dependencies based on multi-attribute dominance},
  journal   = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume    = {33},
  number    = {9},
  publisher = {Institute of Electrical and Electronics Engineers},
  address   = {New York, NY},
  issn      = {1041-4347},
  doi       = {10.1109/TKDE.2020.2967722},
  pages     = {3212--3228},
  year      = {2021},
  abstract  = {With the advent of big data and data lakes, data are often integrated from multiple sources. Such integrated data are often of poor quality, due to inconsistencies, errors, and so forth. One way to check the quality of data is to infer functional dependencies (fds). However, in many modern applications it might be necessary to extract properties and relationships that are not captured through fds, due to the necessity to admit exceptions, or to consider similarity rather than equality of data values. Relaxed fds (rfds) have been introduced to meet these needs, but their discovery from data adds further complexity to an already complex problem, also due to the necessity of specifying similarity and validity thresholds. We propose Domino, a new discovery algorithm for rfds that exploits the concept of dominance in order to derive similarity thresholds of attribute values while inferring rfds.
An experimental evaluation on real datasets demonstrates the discovery performance and the effectiveness of the proposed algorithm.},
  language  = {en}
}

@article{BertiEquilleHarmouchNaumannetal.2018,
  author    = {Berti-Equille, Laure and Harmouch, Hazar and Naumann, Felix and Novelli, Noel and Thirumuruganathan, Saravanan},
  title     = {Discovery of genuine functional dependencies from relational data with missing values},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {11},
  number    = {8},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3204028.3204032},
  pages     = {880--892},
  year      = {2018},
  abstract  = {Functional dependencies (FDs) play an important role in maintaining data quality. They can be used to enforce data consistency and to guide repairs over a database. In this work, we investigate the problem of missing values and its impact on FD discovery. When using existing FD discovery algorithms, some genuine FDs could not be detected precisely due to missing values or some non-genuine FDs can be discovered even though they are caused by missing values with a certain NULL semantics. We define a notion of genuineness and propose algorithms to compute the genuineness score of a discovered FD. This can be used to identify the genuine FDs among the set of all valid dependencies that hold on the data. We evaluate the quality of our method over various real-world and semi-synthetic datasets with extensive experiments. The results show that our method performs well for relatively large FD sets and is able to accurately capture genuine FDs.},
  language  = {en}
}

% NOTE: this key carries the suffix _SHBLJ0013 because the unsuffixed key
% AbramowskiAceroAharonianetal.2013 is already taken by an earlier entry in
% this file; BibTeX and Biber skip entries whose key is repeated.
@article{AbramowskiAceroAharonianetal.2013_SHBLJ0013,
  author    = {Abramowski, Attila and Acero, F. and Aharonian, Felix A. and Akhperjanian, A. G. and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and Barnacka, Anna and Becherini, Yvonne and Tjus, J. Becker and Bernl{\"o}hr, K.
and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J. and Brun, Francois and Brun, Pierre and Bulik, Tomasz and Carrigan, Svenja and Casanova, Sabrina and Cerruti, M. and Chadwick, Paula M. and Chalme-Calvet, R. and Chaves, Ryan C. G. and Cheesebrough, A. and Chretien, M. and Colafrancesco, Sergio and Cologna, Gabriele and Conrad, Jan and Couturier, C. and Dalton, M. and Daniel, M. K. and Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and Drury, L. O'C. and Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and Edwards, T. and Egberts, Kathrin and Eger, P. and Espigat, P. and Farnier, C. and Fegan, S. and Feinstein, F. and Fernandes, M. V. and Fernandez, D. and Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M. and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Gast, H. and Giebels, B. and Glicenstein, J. F. and Goering, D. and Grondin, M. -H. and Grudzinska, M. and Haeffner, S. and Hague, J. D. and Hahn, J. and Harris, J. and Heinzelmann, G. and Henri, G. and Hermann, G. and Hervet, O. and Hillert, A. and Hinton, James Anthony and Hofmann, W. and Hofverberg, P. and Holler, Markus and Horns, D. and Jacholkowska, A. and Jahn, C. and Jamrozy, M. and Janiak, M. and Jankowsky, F. and Jung, I. and Kastendieck, M. A. and Katarzynski, K. and Katz, U. and Kaufmann, S. and Khelifi, B. and Kieffer, M. and Klepser, S. and Klochkov, D. and Kluzniak, W. and Kneiske, T. and Kolitzus, D. and Komin, Nu. and Kosack, K. and Krakau, S. and Krayzel, F. and Krueger, P. P. and Laffon, H. and Lamanna, G. and Lefaucheur, J. and Lemoine-Goumard, M. and Lenain, J. -P. and Lennarz, D. and Lohse, T. and Lopatin, A. and Lu, C. -C. and Marandon, V. and Marcowith, Alexandre and Maxted, N. and Mayer, M. and McComb, T. J. L. and Medina, M. C. and Mehault, J. and Menzler, U. and Meyer, M. and Moderski, R. and Mohamed, M. 
and Moulin, Emmanuel and Murach, T. and Naumann, C. L. and de Naurois, M. and Nedbal, D. and Niemiec, J. and Nolan, S. J. and Oakes, L. and Ohm, S. and Wilhelmi, E. de Ona and Opitz, B. and Ostrowski, M. and Oya, I. and Panter, M. and Parsons, R. D. and Arribas, M. Paz and Pekeur, N. W. and Pelletier, G. and Perez, J. and Petrucci, P. -O. and Peyaud, B. and Pita, S. and Poon, H. and Punch, M. and Quirrenbach, A. and Raab, S. and Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and de los Reyes, R. and Rieger, F. and Rob, L. and Rosier-Lees, S. and Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V. and Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R. and Schuessler, F. and Schulz, A. and Schwanke, U. and Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G. and Spiess, F. and Stawarz, L. and Steenkamp, R. and Stegmann, Christian and Stinzing, F. and Stycz, K. and Sushch, Iurii and Szostek, A. and Tavernet, J. -P. and Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K. and van Eldik, C. and Vasileiadis, G. and Venter, C. and Viana, A. and Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M. and Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M. and White, R. and Wierzcholska, A. and Willmann, P. and Woernlein, A. and Wouters, D. and Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and Zech, Alraune and Zechlin, H. 
-S.},
  title        = {Discovery of high and very high-energy emission from the {BL Lacertae} object {SHBL J001355.9-185406}},
  journal      = {Astronomy and astrophysics : an international weekly journal},
  volume       = {554},
  publisher    = {EDP Sciences},
  address      = {Les Ulis},
  organization = {HESS Collaboration},
  issn         = {0004-6361},
  doi          = {10.1051/0004-6361/201220996},
  pages        = {8},
  year         = {2013},
  abstract     = {The detection of the high-frequency peaked BL Lac object (HBL) SHBL J001355.9-185406 (z = 0.095) at high (HE; 100 MeV < E < 300 GeV) and very high-energy (VHE; E > 100 GeV) with the Fermi Large Area Telescope (LAT) and the High Energy Stereoscopic System (H.E.S.S.) is reported. Dedicated observations were performed with the H. E. S. S. telescopes, leading to a detection at the 5.5 sigma significance level. The measured flux above 310 GeV is (8.3 +/- 1.7(stat) +/- 1.7(sys)) x 10(-13) photons cm(-2) s(-1) (about 0.6\% of that of the Crab Nebula), and the power-law spectrum has a photon index of Gamma = 3.4 +/- 0.5(stat) +/- 0.2(sys). Using 3.5 years of publicly available Fermi-LAT data, a faint counterpart has been detected in the LAT data at the 5.5 sigma significance level, with an integrated flux above 300 MeV of (9.3 +/- 3.4(stat) +/- 0.8(sys)) x 10(-10) photons cm(-2) s(-1) and a photon index of Gamma = 1.96 +/- 0.20(stat) +/- 0.08(sys). X-ray observations with Swift-XRT allow the synchrotron peak energy in vF(v) representation to be located at similar to 1.0 keV. The broadband spectral energy distribution is modelled with a one-zone synchrotron self-Compton (SSC) model and the optical data by a black-body emission describing the thermal emission of the host galaxy. The derived parameters are typical of HBLs detected at VHE, with a particle-dominated jet.},
  language     = {en}
}

@article{AbramowskiAceroAharonianetal.2015,
  author       = {Abramowski, Attila and Acero, F. and Aharonian, Felix A.
and Benkhali, Faical Ait and Akhperjanian, A. G. and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and Barnacka, Anna and Becherini, Yvonne and Tjus, J. Becker and Bernl{\"o}hr, K. and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and Boettcher, Markus and Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J. and Brun, Francois and Brun, Pierre and Bulik, Tomasz and Carrigan, Svenja and Casanova, Sabrina and Cerruti, M. and Chadwick, Paula M. and Chalme-Calvet, R. and Chaves, Ryan C. G. and Cheesebrough, A. and Chretien, M. and Clapson, A. C. and Colafrancesco, Sergio and Cologna, Gabriele and Conrad, Jan and Couturier, C. and Cui, Y. and Dalton, M. and Daniel, M. K. and Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and Edwards, T. and Egberts, Kathrin and Eger, P. and Espigat, P. and Farnier, C. and Fegan, S. and Feinstein, F. and Fernandes, M. V. and Fernandez, D. and Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M. and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Giavitto, G. and Giebels, B. and Glicenstein, J. F. and Grondin, M. -H. and Grudzinska, M. and Haeffner, S. and Hahn, J. and Harris, J. and Heinzelmann, G. and Henri, G. and Hermann, G. and Hervet, O. and Hillert, A. and Hinton, James Anthony and Hofmann, W. and Hofverberg, P. and Holler, Markus and Horns, D. and Jacholkowska, A. and Jahn, C. and Jamrozy, Marek and Janiak, M. and Jankowsky, F. and Jung, I. and Kastendieck, M. A. and Katarzynski, Krzysztof and Katz, Uli and Kaufmann, S. and Khelifi, B. and Kieffer, M. and Klepser, S. and Klochkov, D. and Kluzniak, W. and Kneiske, T. and Kolitzus, D. and Komin, Nu. and Kosack, K. and Krakau, S. and Krayzel, F. and Krueger, P. P. and Laffon, H. and Lamanna, G. and Lefaucheur, J. and Lemiere, A. and Lemoine-Goumard, M. and Lenain, J. -P.
and Lennarz, D. and Lohse, T. and Lopatin, A. and Lu, C. -C. and Marandon, V. and Marcowith, Alexandre and Marx, R. and Maurin, G. and Maxted, N. and Mayer, M. and McComb, T. J. L. and Mehault, J. and Meintjes, P. J. and Menzler, U. and Meyer, Manuel and Moderski, R. and Mohamed, M. and Moulin, Emmanuel and Murach, T. and Naumann, C. L. and de Naurois, M. and Niemiec, J. and Nolan, S. J. and Oakes, L. and Ohm, S. and Wilhelmi, E. de Ona and Opitz, B. and Ostrowski, M. and Oya, I. and Panter, M. and Parsons, R. D. and Arribas, M. Paz and Pekeur, N. W. and Pelletier, G. and Perez, J. and Petrucci, P. -O. and Peyaud, B. and Pita, S. and Poon, H. and Puehlhofer, G. and Punch, M. and Quirrenbach, A. and Raab, S. and Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and de los Reyes, R. and Rieger, F. and Rob, L. and Romoli, C. and Rosier-Lees, S. and Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V. and Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R. and Schuessler, F. and Schulz, A. and Schwanke, U. and Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G. and Spies, F. and Stawarz, L. and Steenkamp, R. and Stegmann, Christian and Stinzing, F. and Stycz, K. and Sushch, Iurii and Szostek, A. and Tavernet, J. -P. and Tavernier, T. and Taylor, A. M. and Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K. and van Eldik, Christopher and van Soelen, B. and Vasileiadis, G. and Venter, C. and Viana, A. and Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M. and Vuillaume, T. and Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M. and Weitzel, Q. and White, R. and Wierzcholska, A. and Willmann, P. and Woernlein, A. and Wouters, D. and Zabalza, V. and Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and Zech, Alraune and Zechlin, H. 
-S.},
  title        = {Discovery of the {VHE} gamma-ray source {HESS J1832-093} in the vicinity of {SNR G22.7-0.2}},
  journal      = {Monthly notices of the Royal Astronomical Society},
  volume       = {446},
  number       = {2},
  publisher    = {Oxford Univ. Press},
  address      = {Oxford},
  organization = {HESS Collaboration},
  issn         = {0035-8711},
  doi          = {10.1093/mnras/stu2148},
  pages        = {1163--1169},
  year         = {2015},
  language     = {en}
}

% NOTE: this key carries the suffix _PKS0301 because the unsuffixed key
% AbramowskiAceroAharonianetal.2013 is already taken by an earlier entry in
% this file; BibTeX and Biber skip entries whose key is repeated.
@article{AbramowskiAceroAharonianetal.2013_PKS0301,
  author       = {Abramowski, Attila and Acero, F. and Aharonian, Felix A. and Benkhali, Faical Ait and Akhperjanian, A. G. and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and Barnacka, Anna and Becherini, Yvonne and Tjus, J. Becker and Bernl{\"o}hr, K. and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and Boettcher, Markus and Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J. and Brun, Francois and Brun, Pierre and Bulik, Tomasz and Carrigan, Svenja and Casanova, Sabrina and Cerruti, M. and Chadwick, Paula M. and Chalme-Calvet, R. and Chaves, Ryan C. G. and Cheesebrough, A. and Chretien, M. and Colafrancesco, Sergio and Cologna, Gabriele and Conrad, Jan and Couturier, C. and Dalton, M. and Daniel, M. K. and Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and Drury, L. O'C. and Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and Edwards, T. and Egberts, Kathrin and Eger, P. and Espigat, P. and Farnier, C. and Fegan, S. and Feinstein, F. and Fernandes, M. V. and Fernandez, D. and Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M. and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Giebels, B. and Glicenstein, J. F. and Grondin, M. -H. and Grudzinska, M. and Haeffner, S. and Hague, J. D. and Hahn, J. and Harris, J. and Heinzelmann, G. and Henri, G. and Hermann, G. and Hervet, O. and Hillert, A.
and Hinton, James Anthony and Hofmann, W. and Hofverberg, P. and Holler, M. and Horns, D. and Jacholkowska, A. and Jahn, C. and Jamrozy, M. and Janiak, M. and Jankowsky, F. and Jung, I. and Kastendieck, M. A. and Katarzynski, K. and Katz, U. and Kaufmann, S. and Khelifi, B. and Kieffer, M. and Klepser, S. and Klochkov, D. and Kluzniak, W. and Kneiske, T. and Kolitzus, D. and Komin, Nu. and Kosack, K. and Krakau, S. and Krayzel, F. and Krueger, P. P. and Laffon, H. and Lamanna, G. and Lefaucheur, J. and Lemoine-Goumard, M. and Lenain, J. -P. and Lennarz, D. and Lohse, T. and Lopatin, A. and Lu, C. -C. and Marandon, V. and Marcowith, Alexandre and Marx, R. and Maurin, G. and Maxted, N. and Mayer, M. and McComb, T. J. L. and Medina, M. C. and Mehault, J. and Menzler, U. and Meyer, M. and Moderski, R. and Mohamed, M. and Moulin, Emmanuel and Murach, T. and Naumann, C. L. and de Naurois, M. and Nedbal, D. and Niemiec, J. and Nolan, S. J. and Oakes, L. and Ohm, S. and Wilhelmi, E. de Ona and Opitz, B. and Ostrowski, M. and Oya, I. and Panter, M. and Parsons, R. D. and Arribas, M. Paz and Pekeur, N. W. and Pelletier, G. and Perez, J. and Petrucci, P. -O. and Peyaud, B. and Pita, S. and Poon, H. and Puehlhofer, G. and Punch, M. and Quirrenbach, A. and Raab, S. and Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and de los Reyes, R. and Rieger, F. and Rob, L. and Rosier-Lees, S. and Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V. and Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R. and Schuessler, F. and Schulz, A. and Schwanke, U. and Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G. and Spies, F. and Stawarz, L. and Steenkamp, R. and Stegmann, Christian and Stinzing, F. and Stycz, K. and Sushch, Iurii and Szostek, A. and Tavernet, J. -P. and Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K. and van Eldik, C. and Vasileiadis, G. and Venter, C. and Viana, A. and Vincent, P. and Voelk, H. J. and Volpe, F. 
and Vorster, M. and Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M. and Weitzel, Q. and White, R. and Wierzcholska, A. and Willmann, P. and Woernlein, A. and Wouters, D. and Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and Zech, Alraune and Zechlin, H. -S.},
  title        = {Discovery of very high energy gamma-ray emission from the {BL Lacertae} object {PKS0301-243} with {HESS}},
  journal      = {Astronomy and astrophysics : an international weekly journal},
  volume       = {559},
  publisher    = {EDP Sciences},
  address      = {Les Ulis},
  organization = {HESS Collaboration},
  issn         = {0004-6361},
  doi          = {10.1051/0004-6361/201321639},
  pages        = {11},
  year         = {2013},
  abstract     = {The active galactic nucleus PKS 0301-243 (z = 0.266) is a high-synchrotron-peaked BL Lac object that is detected at high energies (HE, 100 MeV < E < 100 GeV) by Fermi/LAT. This paper reports on the discovery of PKS 0301-243 at very high energies (E > 100 GeV) by the High Energy Stereoscopic System (H.E.S.S.) from observations between September 2009 and December 2011 for a total live time of 34.9 h. Gamma rays above 200 GeV are detected at a significance of 9.4 sigma. A hint of variability at the 2.5 sigma level is found. An integral flux I(E > 200GeV) = (3.3 +/- 1.1(stat) +/- 0.7(syst)) x 10(-12) ph cm(-2) s(-1) and a photon index Gamma = 4.6 +/- 0.7(stat) +/- 0.2(syst) are measured. Multi-wavelength light curves in HE, X-ray and optical bands show strong variability, and a minimal variability timescale of eight days is estimated from the optical light curve. A single-zone leptonic synchrotron self-Compton scenario satisfactorily reproduces the multi-wavelength data. In this model, the emitting region is out of equipartition and the jet is particle dominated.
Because of its high redshift compared to other sources observed at TeV energies, the very high energy emission from PKS 0301-243 is attenuated by the extragalactic background light (EBL) and the measured spectrum is used to derive an upper limit on the opacity of the EBL.},
  language     = {en}
}

@book{BauckmannLeserNaumann2010,
  author       = {Bauckmann, Jana and Leser, Ulf and Naumann, Felix},
  title        = {Efficient and exact computation of inclusion dependencies for data integration},
  publisher    = {Universit{\"a}tsverlag Potsdam},
  address      = {Potsdam},
  institution  = {Universit{\"a}t Potsdam},
  isbn         = {978-3-86956-048-9},
  url          = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-41396},
  pages        = {36},
  year         = {2010},
  abstract     = {Data obtained from foreign data sources often come with only superficial structural information, such as relation names and attribute names. Other types of metadata that are important for effective integration and meaningful querying of such data sets are missing. In particular, relationships among attributes, such as foreign keys, are crucial metadata for understanding the structure of an unknown database. The discovery of such relationships is difficult, because in principle for each pair of attributes in the database each pair of data values must be compared. A precondition for a foreign key is an inclusion dependency (IND) between the key and the foreign key attributes. We present with Spider an algorithm that efficiently finds all INDs in a given relational database. It leverages the sorting facilities of DBMS but performs the actual comparisons outside of the database to save computation. Spider analyzes very large databases up to an order of magnitude faster than previous approaches. We also evaluate in detail the effectiveness of several heuristics to reduce the number of necessary comparisons.
Furthermore, we generalize Spider to find composite INDs covering multiple attributes, and partial INDs, which are true INDs for all but a certain number of values. This last type is particularly relevant when integrating dirty data as is often the case in the life sciences domain - our driving motivation.},
  language  = {en}
}

@article{SchirmerPapenbrockKoumarelasetal.2020,
  author    = {Schirmer, Philipp and Papenbrock, Thorsten and Koumarelas, Ioannis and Naumann, Felix},
  title     = {Efficient discovery of matching dependencies},
  journal   = {ACM transactions on database systems : TODS},
  volume    = {45},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0362-5915},
  doi       = {10.1145/3392778},
  pages     = {33},
  year      = {2020},
  abstract  = {Matching dependencies (MDs) are data profiling results that are often used for data integration, data cleaning, and entity matching. They are a generalization of functional dependencies (FDs) matching similar rather than same elements. As their discovery is very difficult, existing profiling algorithms find either only small subsets of all MDs or their scope is limited to only small datasets. We focus on the efficient discovery of all interesting MDs in real-world datasets. For this purpose, we propose HyMD, a novel MD discovery algorithm that finds all minimal, non-trivial MDs within given similarity boundaries. The algorithm extracts the exact similarity thresholds for the individual MDs from the data instead of using predefined similarity thresholds. For this reason, it is the first approach to solve the MD discovery problem in an exact and truly complete way. If needed, the algorithm can, however, enforce certain properties on the reported MDs, such as disjointness and minimum support, to focus the discovery on such results that are actually required by downstream use cases.
HyMD is technically a hybrid approach that combines the two most popular dependency discovery strategies in related work: lattice traversal and inference from record pairs. Despite the additional effort of finding exact similarity thresholds for all MD candidates, the algorithm is still able to efficiently process large datasets, e.g., datasets larger than 3 GB.},
  language  = {en}
}

@article{KoumarelasKroschkMosleyetal.2018,
  author    = {Koumarelas, Ioannis and Kroschk, Axel and Mosley, Clifford and Naumann, Felix},
  title     = {Experience: Enhancing address matching with geocoding and similarity measure selection},
  journal   = {Journal of Data and Information Quality},
  volume    = {10},
  number    = {2},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3232852},
  pages     = {1--16},
  year      = {2018},
  abstract  = {Given a query record, record matching is the problem of finding database records that represent the same real-world object. In the easiest scenario, a database record is completely identical to the query. However, in most cases, problems do arise, for instance, as a result of data errors or data integrated from multiple sources or received from restrictive form fields. These problems are usually difficult, because they require a variety of actions, including field segmentation, decoding of values, and similarity comparisons, each requiring some domain knowledge. In this article, we study the problem of matching records that contain address information, including attributes such as Street-address and City. To facilitate this matching process, we propose a domain-specific procedure to, first, enrich each record with a more complete representation of the address information through geocoding and reverse-geocoding and, second, to select the best similarity measure per each address attribute that will finally help the classifier to achieve the best f-measure.
We report on our experience in selecting geocoding services and discovering similarity measures for a concrete but common industry use-case.},
  language  = {en}
}

@article{HackerKrestelGrundmannetal.2020,
  author    = {Hacker, Philipp and Krestel, Ralf and Grundmann, Stefan and Naumann, Felix},
  title     = {Explainable {AI} under contract and tort law},
  journal   = {Artificial intelligence and law},
  volume    = {28},
  number    = {4},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {0924-8463},
  doi       = {10.1007/s10506-020-09260-6},
  pages     = {415--439},
  year      = {2020},
  abstract  = {This paper shows that the law, in subtle ways, may set hitherto unrecognized incentives for the adoption of explainable machine learning applications. In doing so, we make two novel contributions. First, on the legal side, we show that to avoid liability, professional actors, such as doctors and managers, may soon be legally compelled to use explainable ML models. We argue that the importance of explainability reaches far beyond data protection law, and crucially influences questions of contractual and tort liability for the use of ML models. To this effect, we conduct two legal case studies, in medical and corporate merger applications of ML. As a second contribution, we discuss the (legally required) trade-off between accuracy and explainability and demonstrate the effect in a technical case study in the context of spam classification.},
  language  = {en}
}