% Bibliography database: ten entries, one field per line.
%
% Conventions used in this file:
%   * Non-ASCII letters are written as BibTeX "special characters", e.g. {\"o}, {\'e}.
%   * Page ranges use a double hyphen without surrounding spaces.
%   * Acronyms and proper nouns in titles are brace-protected against style recasing.
%   * Article entries carry the venue in "journal" only; an identical "series" copy
%     was dropped because biblatex would print it a second time.
%   * Comments live outside entries; never place an at-sign in a comment, because
%     BibTeX treats it as the start of a new entry.

% ICDE 2019 conference paper. Typed as inproceedings with the proceedings title in
% "booktitle" (it was previously stored under "journal" and "series" of a misc entry).
@inproceedings{KruseKaoudiQuianeRuizetal.2019,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Quian{\'e}-Ruiz, Jorge-Arnulfo and
               Chawla, Sanjay and Naumann, Felix and Contreras-Rojas, Bertty},
  title     = {Optimizing Cross-Platform Data Movement},
  booktitle = {2019 IEEE 35th International Conference on Data Engineering (ICDE)},
  publisher = {IEEE},
  address   = {New York},
  isbn      = {978-1-5386-7474-1},
  issn      = {1084-4627},
  doi       = {10.1109/ICDE.2019.00162},
  pages     = {1642--1645},
  year      = {2019},
  abstract  = {Data analytics are moving beyond the limits of a single data processing
               platform. A cross-platform query optimizer is necessary to enable
               applications to run their tasks over multiple platforms efficiently and in
               a platform-agnostic manner. For the optimizer to be effective, it must
               consider data movement costs across different data processing platforms.
               In this paper, we present the graph-based data movement strategy used by
               RHEEM, our open-source cross-platform system. In particular, we (i) model
               the data movement problem as a new graph problem, which we prove to be
               NP-hard, and (ii) propose a novel graph exploration algorithm, which allows
               RHEEM to discover multiple hidden opportunities for cross-platform data
               processing.},
  language  = {en},
}

@article{VitaglianoJiangNaumann2021,
  author    = {Vitagliano, Gerardo and Jiang, Lan and Naumann, Felix},
  title     = {Detecting layout templates in complex multiregion files},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {15},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {2150-8097},
  doi       = {10.14778/3494124.3494145},
  pages     = {646--658},
  year      = {2021},
  abstract  = {Spreadsheets are among the most commonly used file formats for data
               management, distribution, and analysis. Their widespread employment makes
               it easy to gather large collections of data, but their flexible
               canvas-based structure makes automated analysis difficult without heavy
               preparation. One of the common problems that practitioners face is the
               presence of multiple, independent regions in a single spreadsheet, possibly
               separated by repeated empty cells. We define such files as ``multiregion''
               files. In collections of various spreadsheets, we can observe that some
               share the same layout. We present the Mondrian approach to automatically
               identify layout templates across multiple files and systematically extract
               the corresponding regions. Our approach is composed of three phases: first,
               each file is rendered as an image and inspected for elements that could
               form regions; then, using a clustering algorithm, the identified elements
               are grouped to form regions; finally, every file layout is represented as a
               graph and compared with others to find layout templates. We compare our
               method to state-of-the-art table recognition algorithms on two corpora of
               real-world enterprise spreadsheets. Our approach shows the best
               performances in detecting reliable region boundaries within each file and
               can correctly identify recurring layouts across files.},
  language  = {en},
}

% University of Potsdam secondary-publication record. It shares title, authors and
% abstract with the VLDB Journal article below (presumably a postprint of it; confirm
% against the repository record). This entry keeps the original citation key because
% it was the first of the two entries that used it.
@misc{KruseKaoudiContrerasRojasetal.2020,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and
               Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  series    = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t},
  journal   = {Zweitver{\"o}ffentlichungen der Universit{\"a}t Potsdam : Reihe der Digital Engineering Fakult{\"a}t},
  number    = {6},
  doi       = {10.25932/publishup-51944},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-519443},
  pages     = {26},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this
               paper, we present the cost-based optimizer of Rheem, an open-source
               cross-platform system that copes with these new requirements. The optimizer
               allocates the subtasks of data analytic tasks to the most suitable
               platforms. Our main contributions are: (i) a mechanism based on graph
               transformations to explore alternative execution strategies; (ii) a novel
               graph-based approach to determine efficient data movement plans among
               subtasks and platforms; and (iii) an efficient plan enumeration algorithm,
               based on a novel enumeration algebra. We extensively evaluate our optimizer
               under diverse real tasks. We show that our optimizer can perform tasks more
               than one order of magnitude faster when using multiple platforms than when
               using a single platform.},
  language  = {en},
}

% Journal version. The key carries an "a" suffix because the unsuffixed key is already
% taken by the entry above; BibTeX rejects a repeated key and skips the later entry.
@article{KruseKaoudiContrerasRojasetal.2020a,
  author    = {Kruse, Sebastian and Kaoudi, Zoi and Contreras-Rojas, Bertty and
               Chawla, Sanjay and Naumann, Felix and Quian{\'e}-Ruiz, Jorge-Arnulfo},
  title     = {{RHEEMix} in the data jungle},
  journal   = {The VLDB Journal},
  volume    = {29},
  number    = {6},
  publisher = {Springer},
  address   = {Berlin},
  issn      = {1066-8888},
  doi       = {10.1007/s00778-020-00612-x},
  pages     = {1287--1310},
  year      = {2020},
  abstract  = {Data analytics are moving beyond the limits of a single platform. In this
               paper, we present the cost-based optimizer of Rheem, an open-source
               cross-platform system that copes with these new requirements. The optimizer
               allocates the subtasks of data analytic tasks to the most suitable
               platforms. Our main contributions are: (i) a mechanism based on graph
               transformations to explore alternative execution strategies; (ii) a novel
               graph-based approach to determine efficient data movement plans among
               subtasks and platforms; and (iii) an efficient plan enumeration algorithm,
               based on a novel enumeration algebra. We extensively evaluate our optimizer
               under diverse real tasks. We show that our optimizer can perform tasks more
               than one order of magnitude faster when using multiple platforms than when
               using a single platform.},
  language  = {en},
}

% The record originally listed "publisher" twice; the first value is kept as the
% publisher and the second one is preserved under "institution".
@book{MeinelPlattnerDoellneretal.2014,
  author      = {Meinel, Christoph and Plattner, Hasso and
                 D{\"o}llner, J{\"u}rgen Roland Friedrich and Weske, Mathias and
                 Polze, Andreas and Hirschfeld, Robert and Naumann, Felix and
                 Giese, Holger and Baudisch, Patrick},
  title       = {Proceedings of the 7th {Ph.D.} Retreat of the {HPI} Research School on Service-oriented Systems Engineering},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  institution = {Universit{\"a}t Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-273-5},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-63490},
  pages       = {ii, 218},
  year        = {2014},
  abstract    = {Design and Implementation of service-oriented architectures imposes a
                 huge number of research questions from the fields of software
                 engineering, system analysis and modeling, adaptability, and application
                 integration. Component orientation and web services are two approaches
                 for design and realization of complex web-based systems. Both approaches
                 allow for dynamic application adaptation as well as integration of
                 enterprise applications. Commonly used technologies, such as J2EE and
                 .NET, form de facto standards for the realization of complex distributed
                 systems. Evolution of component systems has led to web services and
                 service-based architectures. This has been manifested in a multitude of
                 industry standards and initiatives such as XML, WSDL, UDDI, SOAP, etc.
                 All these achievements lead to a new and promising paradigm in IT systems
                 engineering which proposes to design complex software solutions as
                 collaboration of contractually defined software services.
                 Service-Oriented Systems Engineering represents a symbiosis of best
                 practices in object-orientation, component-based development, distributed
                 computing, and business process management. It provides integration of
                 business and IT concerns. The annual Ph.D. Retreat of the Research School
                 provides each member the opportunity to present his/her current state of
                 their research and to give an outline of a prospective Ph.D. thesis. Due
                 to the interdisciplinary structure of the Research School, this technical
                 report covers a wide range of research topics. These include but are not
                 limited to: Self-Adaptive Service-Oriented Systems, Operating System
                 Support for Service-Oriented Systems, Architecture and Modeling of
                 Service-Oriented Systems, Adaptive Process Management, Services
                 Composition and Workflow Planning, Security Engineering of Service-Based
                 IT Systems, Quantitative Analysis and Optimization of Service-Oriented
                 Systems, Service-Oriented Systems in 3D Computer Graphics and
                 Service-Oriented Geoinformatics.},
  language    = {en},
}

@article{LosterKoumarelasNaumann2021,
  author    = {Loster, Michael and Koumarelas, Ioannis and Naumann, Felix},
  title     = {Knowledge transfer for entity resolution with {Siamese} neural networks},
  journal   = {ACM journal of data and information quality},
  volume    = {13},
  number    = {1},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1936-1955},
  doi       = {10.1145/3410157},
  pages     = {25},
  year      = {2021},
  abstract  = {The integration of multiple data sources is a common problem in a large
               variety of applications. Traditionally, handcrafted similarity measures are
               used to discover, merge, and integrate multiple representations of the same
               entity---duplicates---into a large homogeneous collection of data. Often,
               these similarity measures do not cope well with the heterogeneity of the
               underlying dataset. In addition, domain experts are needed to manually
               design and configure such measures, which is both time-consuming and
               requires extensive domain expertise. We propose a deep Siamese neural
               network, capable of learning a similarity measure that is tailored to the
               characteristics of a particular dataset. With the properties of deep
               learning methods, we are able to eliminate the manual feature engineering
               process and thus considerably reduce the effort required for model
               construction. In addition, we show that it is possible to transfer
               knowledge acquired during the deduplication of one dataset to another, and
               thus significantly reduce the amount of data required to train a similarity
               measure. We evaluated our method on multiple datasets and compare our
               approach to state-of-the-art deduplication methods. Our approach
               outperforms competitors by up to +26 percent F-measure, depending on task
               and dataset. In addition, we show that knowledge transfer is not only
               feasible, but in our experiments led to an improvement in F-measure of up
               to +4.7 percent.},
  language  = {en},
}

% The record originally listed "publisher" twice; the first value is kept as the
% publisher and the second one is preserved under "institution".
@book{AbedjanNaumann2011,
  author      = {Abedjan, Ziawasch and Naumann, Felix},
  title       = {Advancing the discovery of unique column combinations},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  institution = {Universit{\"a}t Potsdam},
  address     = {Potsdam},
  isbn        = {978-3-86956-148-6},
  issn        = {1613-5652},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-53564},
  pages       = {25},
  year        = {2011},
  abstract    = {Unique column combinations of a relational database table are sets of
                 columns that contain only unique values. Discovering such combinations is
                 a fundamental research problem and has many different data management and
                 knowledge discovery applications. Existing discovery algorithms are
                 either brute force or have a high memory load and can thus be applied
                 only to small datasets or samples. In this paper, the well-known GORDIAN
                 algorithm and ``Apriori-based'' algorithms are compared and analyzed for
                 further optimization. We greatly improve the Apriori algorithms through
                 efficient candidate generation and statistics-based pruning methods. A
                 hybrid solution HCAGORDIAN combines the advantages of GORDIAN and our new
                 algorithm HCA, and it significantly outperforms all previous work in many
                 situations.},
  language    = {en},
}

@article{BonnetDongNaumannetal.2021,
  author    = {Bonnet, Philippe and Dong, Xin Luna and Naumann, Felix and
               T{\"o}z{\"u}n, P{\i}nar},
  title     = {{VLDB} 2021},
  journal   = {SIGMOD record},
  volume    = {50},
  number    = {4},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {0163-5808},
  doi       = {10.1145/3516431.3516447},
  pages     = {50--53},
  year      = {2021},
  abstract  = {The 47th International Conference on Very Large Databases (VLDB'21) was
               held on August 16--20, 2021 as a hybrid conference. It attracted 180
               in-person attendees in Copenhagen and 840 remote attendees. In this paper,
               we describe our key decisions as general chairs and program committee
               chairs and share the lessons we learned.},
  language  = {en},
}

% H.E.S.S. collaboration paper. Compound surnames are written in "Last, First" form
% so BibTeX does not split them. NOTE(review): this entry spells "Spiess, F." while
% the 2014 entry below spells "Spies, F."; verify against the publisher record.
@article{AbramowskiAceroAharonianetal.2013,
  author       = {Abramowski, Attila and Acero, F. and Aharonian, Felix A. and
                  Akhperjanian, A. G. and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and
                  Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and
                  Barnacka, Anna and Becherini, Yvonne and Becker Tjus, J. and
                  Bernl{\"o}hr, K. and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and
                  Boisson, Catherine and Bolmont, J. and Bordas, Pol and Brucker, J. and
                  Brun, Francois and Brun, Pierre and Bulik, Tomasz and Carrigan, Svenja and
                  Casanova, Sabrina and Cerruti, M. and Chadwick, Paula M. and
                  Chalme-Calvet, R. and Chaves, Ryan C. G. and Cheesebrough, A. and
                  Chretien, M. and Colafrancesco, Sergio and Cologna, Gabriele and
                  Conrad, Jan and Couturier, C. and Dalton, M. and Daniel, M. K. and
                  Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and
                  Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and
                  Drury, L. O'C. and Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and
                  Edwards, T. and Egberts, Kathrin and Eger, P. and Espigat, P. and
                  Farnier, C. and Fegan, S. and Feinstein, F. and Fernandes, M. V. and
                  Fernandez, D. and Fiasson, A. and Fontaine, G. and Foerster, A. and
                  Fuessling, M. and Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and
                  Gast, H. and Giebels, B. and Glicenstein, J. F. and Goering, D. and
                  Grondin, M.-H. and Grudzinska, M. and Haeffner, S. and Hague, J. D. and
                  Hahn, J. and Harris, J. and Heinzelmann, G. and Henri, G. and
                  Hermann, G. and Hervet, O. and Hillert, A. and Hinton, James Anthony and
                  Hofmann, W. and Hofverberg, P. and Holler, Markus and Horns, D. and
                  Jacholkowska, A. and Jahn, C. and Jamrozy, M. and Janiak, M. and
                  Jankowsky, F. and Jung, I. and Kastendieck, M. A. and Katarzynski, K. and
                  Katz, U. and Kaufmann, S. and Khelifi, B. and Kieffer, M. and
                  Klepser, S. and Klochkov, D. and Kluzniak, W. and Kneiske, T. and
                  Kolitzus, D. and Komin, Nu. and Kosack, K. and Krakau, S. and
                  Krayzel, F. and Krueger, P. P. and Laffon, H. and Lamanna, G. and
                  Lefaucheur, J. and Lemoine-Goumard, M. and Lenain, J.-P. and
                  Lennarz, D. and Lohse, T. and Lopatin, A. and Lu, C.-C. and
                  Marandon, V. and Marcowith, Alexandre and Maxted, N. and Mayer, M. and
                  McComb, T. J. L. and Medina, M. C. and Mehault, J. and Menzler, U. and
                  Meyer, M. and Moderski, R. and Mohamed, M. and Moulin, Emmanuel and
                  Murach, T. and Naumann, C. L. and de Naurois, M. and Nedbal, D. and
                  Niemiec, J. and Nolan, S. J. and Oakes, L. and Ohm, S. and
                  de Ona Wilhelmi, E. and Opitz, B. and Ostrowski, M. and Oya, I. and
                  Panter, M. and Parsons, R. D. and Paz Arribas, M. and Pekeur, N. W. and
                  Pelletier, G. and Perez, J. and Petrucci, P.-O. and Peyaud, B. and
                  Pita, S. and Poon, H. and Punch, M. and Quirrenbach, A. and Raab, S. and
                  Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and
                  de los Reyes, R. and Rieger, F. and Rob, L. and Rosier-Lees, S. and
                  Rowell, G. and Rudak, B. and Rulten, C. B. and Sahakian, V. and
                  Sanchez, David M. and Santangelo, Andrea and Schlickeiser, R. and
                  Schuessler, F. and Schulz, A. and Schwanke, U. and Schwarzburg, S. and
                  Schwemmer, S. and Sol, H. and Spengler, G. and Spiess, F. and
                  Stawarz, L. and Steenkamp, R. and Stegmann, Christian and Stinzing, F. and
                  Stycz, K. and Sushch, Iurii and Szostek, A. and Tavernet, J.-P. and
                  Terrier, R. and Tluczykont, M. and Trichard, C. and Valerius, K. and
                  van Eldik, C. and Vasileiadis, G. and Venter, C. and Viana, A. and
                  Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M. and
                  Wagner, S. J. and Wagner, P. and Ward, M. and Weidinger, M. and
                  White, R. and Wierzcholska, A. and Willmann, P. and Woernlein, A. and
                  Wouters, D. and Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and
                  Zech, Alraune and Zechlin, H.-S.},
  title        = {Discovery of high and very high-energy emission from the {BL} {Lacertae} object {SHBL} {J001355.9-185406}},
  journal      = {Astronomy and astrophysics : an international weekly journal},
  volume       = {554},
  publisher    = {EDP Sciences},
  address      = {Les Ulis},
  organization = {HESS Collaboration},
  issn         = {0004-6361},
  doi          = {10.1051/0004-6361/201220996},
  pages        = {8},
  year         = {2013},
  abstract     = {The detection of the high-frequency peaked BL Lac object (HBL) SHBL
                  J001355.9-185406 ($z = 0.095$) at high (HE; 100 MeV $< E <$ 300 GeV) and
                  very high-energy (VHE; $E > 100$ GeV) with the Fermi Large Area
                  Telescope (LAT) and the High Energy Stereoscopic System (H.E.S.S.) is
                  reported. Dedicated observations were performed with the H.E.S.S.
                  telescopes, leading to a detection at the 5.5$\sigma$ significance
                  level. The measured flux above 310 GeV is
                  $(8.3 \pm 1.7_{\mathrm{stat}} \pm 1.7_{\mathrm{sys}}) \times 10^{-13}$
                  photons cm$^{-2}$ s$^{-1}$ (about 0.6\% of that of the Crab Nebula), and
                  the power-law spectrum has a photon index of
                  $\Gamma = 3.4 \pm 0.5_{\mathrm{stat}} \pm 0.2_{\mathrm{sys}}$. Using 3.5
                  years of publicly available Fermi-LAT data, a faint counterpart has been
                  detected in the LAT data at the 5.5$\sigma$ significance level, with an
                  integrated flux above 300 MeV of
                  $(9.3 \pm 3.4_{\mathrm{stat}} \pm 0.8_{\mathrm{sys}}) \times 10^{-10}$
                  photons cm$^{-2}$ s$^{-1}$ and a photon index of
                  $\Gamma = 1.96 \pm 0.20_{\mathrm{stat}} \pm 0.08_{\mathrm{sys}}$. X-ray
                  observations with Swift-XRT allow the synchrotron peak energy in
                  $\nu F_{\nu}$ representation to be located at $\sim$1.0 keV. The
                  broadband spectral energy distribution is modelled with a one-zone
                  synchrotron self-Compton (SSC) model and the optical data by a
                  black-body emission describing the thermal emission of the host galaxy.
                  The derived parameters are typical of HBLs detected at VHE, with a
                  particle-dominated jet.},
  language     = {en},
}

% H.E.S.S. collaboration paper. Author spellings that differed from the 2013 entry
% above only by an obvious export error were aligned with it.
@article{AbramowskiAharonianBenkhalietal.2014,
  author       = {Abramowski, Attila and Aharonian, Felix A. and Ait Benkhali, Faical and
                  Akhperjanian, A. G. and Ang{\"u}ner, Ekrem O{\u{g}}uzhan and
                  Anton, Gisela and Balenderan, Shangkari and Balzer, Arnim and
                  Barnacka, Anna and Becherini, Yvonne and Becker Tjus, J. and
                  Bernl{\"o}hr, K. and Birsin, E. and Bissaldi, E. and Biteau, Jonathan and
                  Boettcher, Markus and Boisson, Catherine and Bolmont, J. and
                  Bordas, Pol and Brucker, J. and Brun, Francois and Brun, Pierre and
                  Bulik, Tomasz and Carrigan, Svenja and Casanova, Sabrina and
                  Cerruti, M. and Chadwick, Paula M. and Chalme-Calvet, R. and
                  Chaves, Ryan C. G. and Cheesebrough, A. and Chretien, M. and
                  Colafrancesco, Sergio and Cologna, Gabriele and Conrad, Jan and
                  Couturier, C. and Cui, Y. and Dalton, M. and Daniel, M. K. and
                  Davids, I. D. and Degrange, B. and Deil, C. and deWilt, P. and
                  Dickinson, H. J. and Djannati-Ata{\"i}, A. and Domainko, W. and
                  Dubus, G. and Dutson, K. and Dyks, J. and Dyrda, M. and Edwards, T. and
                  Egberts, Kathrin and Eger, P. and Espigat, P. and Farnier, C. and
                  Fegan, S. and Feinstein, F. and Fernandes, M. V. and Fernandez, D. and
                  Fiasson, A. and Fontaine, G. and Foerster, A. and Fuessling, M. and
                  Gajdus, M. and Gallant, Y. A. and Garrigoux, T. and Giavitto, G. and
                  Giebels, B. and Glicenstein, J. F. and Grondin, M.-H. and
                  Grudzinska, M. and Haeffner, S. and Hahn, J. and Harris, J. and
                  Heinzelmann, G. and Henri, G. and Hermann, G. and Hervet, O. and
                  Hillert, A. and Hinton, James Anthony and Hofmann, W. and
                  Hofverberg, P. and Holler, M. and Horns, D. and Jacholkowska, A. and
                  Jahn, C. and Jamrozy, M. and Janiak, M. and Jankowsky, F. and Jung, I. and
                  Kastendieck, M. A. and Katarzynski, K. and Katz, U. and Kaufmann, S. and
                  Khelifi, B. and Kieffer, M. and Klepser, S. and Klochkov, D. and
                  Kluzniak, W. and Kneiske, T. and Kolitzus, D. and Komin, Nu. and
                  Kosack, K. and Krakau, S. and Krayzel, F. and Krueger, P. P. and
                  Laffon, H. and Lamanna, G. and Lefaucheur, J. and Lemiere, A. and
                  Lemoine-Goumard, M. and Lenain, J.-P. and Lennarz, D. and Lohse, T. and
                  Lopatin, A. and Lu, C.-C. and Marandon, V. and Marcowith, Alexandre and
                  Marx, R. and Maurin, G. and Maxted, N. and Mayer, M. and
                  McComb, T. J. L. and Mehault, J. and Meintjes, P. J. and Menzler, U. and
                  Meyer, M. and Moderski, R. and Mohamed, M. and Moulin, Emmanuel and
                  Murach, T. and Naumann, C. L. and de Naurois, M. and Niemiec, J. and
                  Nolan, S. J. and Oakes, L. and Ohm, S. and de Ona Wilhelmi, E. and
                  Opitz, B. and Ostrowski, M. and Oya, I. and Panter, M. and
                  Parsons, R. D. and Paz Arribas, M. and Pekeur, N. W. and Pelletier, G. and
                  Perez, J. and Petrucci, P.-O. and Peyaud, B. and Pita, S. and Poon, H. and
                  Puehlhofer, G. and Punch, M. and Quirrenbach, A. and Raab, S. and
                  Raue, M. and Reimer, A. and Reimer, O. and Renaud, M. and
                  de los Reyes, R. and Rieger, F. and Rob, L. and Romoli, C. and
                  Rosier-Lees, S. and Rowell, G. and Rudak, B. and Rulten, C. B. and
                  Sahakian, V. and Sanchez, David M. and Santangelo, Andrea and
                  Schlickeiser, R. and Schuessler, F. and Schulz, A. and Schwanke, U. and
                  Schwarzburg, S. and Schwemmer, S. and Sol, H. and Spengler, G. and
                  Spies, F. and Stawarz, L. and Steenkamp, R. and Stegmann, Christian and
                  Stinzing, F. and Stycz, K. and Sushch, Iurii and Szostek, A. and
                  Tavernet, J.-P. and Tavernier, T. and Taylor, A. M. and Terrier, R. and
                  Tluczykont, M. and Trichard, C. and Valerius, K. and van Eldik, C. and
                  van Soelen, B. and Vasileiadis, G. and Venter, C. and Viana, A. and
                  Vincent, P. and Voelk, H. J. and Volpe, F. and Vorster, M. and
                  Vuillaume, T. and Wagner, S. J. and Wagner, P. and Ward, M. and
                  Weidinger, M. and Weitzel, Q. and White, R. and Wierzcholska, A. and
                  Willmann, P. and Woernlein, A. and Wouters, D. and Zabalza, V. and
                  Zacharias, M. and Zajczyk, A. and Zdziarski, A. A. and Zech, Alraune and
                  Zechlin, H.-S.},
  title        = {Flux upper limits for 47 {AGN} observed with {HESS} in 2004--2011},
  journal      = {Astronomy and astrophysics : an international weekly journal},
  volume       = {564},
  publisher    = {EDP Sciences},
  address      = {Les Ulis},
  organization = {HESS Collaboration},
  issn         = {0004-6361},
  doi          = {10.1051/0004-6361/201322897},
  pages        = {10},
  year         = {2014},
  abstract     = {Context. About 40\% of the observation time of the High Energy
                  Stereoscopic System (H.E.S.S.) is dedicated to studying active galactic
                  nuclei (AGN), with the aim of increasing the sample of known
                  extragalactic very-high-energy (VHE, $E > 100$ GeV) sources and
                  constraining the physical processes at play in potential emitters. Aims.
                  H.E.S.S. observations of AGN, spanning a period from April 2004 to
                  December 2011, are investigated to constrain their gamma-ray fluxes.
                  Only the 47 sources without significant excess detected at the position
                  of the targets are presented. Methods. Upper limits on VHE fluxes of the
                  targets were computed and a search for variability was performed on the
                  nightly time scale. Results. For 41 objects, the flux upper limits we
                  derived are the most constraining reported to date. These constraints at
                  VHE are compared with the flux level expected from extrapolations of
                  Fermi-LAT measurements in the two-year catalog of AGN. The H.E.S.S.
                  upper limits are at least a factor of two lower than the extrapolated
                  Fermi-LAT fluxes for 11 objects. Taking into account the attenuation by
                  the extragalactic background light reduces the tension for all but two
                  of them, suggesting intrinsic curvature in the high-energy spectra of
                  these two AGN. Conclusions. Compilation efforts led by current VHE
                  instruments are of critical importance for target-selection strategies
                  before the advent of the Cherenkov Telescope Array (CTA).},
  language     = {en},
}