% Conference paper: the proceedings title is stored in booktitle
% (it was previously duplicated in the journal and series fields of a misc entry).
@inproceedings{RepkeKrestelEddingetal.2018,
  author    = {Repke, Tim and Krestel, Ralf and Edding, Jakob and Hartmann, Moritz and Hering, Jonas and Kipping, Dennis and Schmidt, Hendrik and Scordialo, Nico and Zenner, Alexander},
  title     = {Beacon in the Dark},
  booktitle = {Proceedings of the 27th {ACM} International Conference on Information and Knowledge Management},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  pages     = {1871--1874},
  year      = {2018},
  isbn      = {978-1-4503-6014-2},
  doi       = {10.1145/3269206.3269231},
  language  = {en},
  abstract  = {The large amount of heterogeneous data in these email corpora renders experts' investigations by hand infeasible. Auditors or journalists, e.g., who are looking for irregular or inappropriate content or suspicious patterns, are in desperate need for computer-aided exploration tools to support their investigations. We present our Beacon system for the exploration of such corpora at different levels of detail. A distributed processing pipeline combines text mining methods and social network analysis to augment the already semi-structured nature of emails. The user interface ties into the resulting cleaned and enriched dataset. For the interface design we identify three objectives expert users have: gain an initial overview of the data to identify leads to investigate, understand the context of the information at hand, and have meaningful filters to iteratively focus onto a subset of emails. To this end we make use of interactive visualisations based on rearranged and aggregated extracted information to reveal salient patterns.},
}

% PhD thesis; the abstract text has been cleaned of extraction artifacts
% (missing or misplaced spaces, a stray hyphen, and a spelling slip).
@phdthesis{Repke2022,
  author   = {Repke, Tim},
  title    = {Machine-learning-assisted corpus exploration and visualisation},
  school   = {Universit{\"a}t Potsdam},
  pages    = {xii, 131},
  year     = {2022},
  doi      = {10.25932/publishup-56263},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-562636},
  language = {en},
  abstract = {Text collections, such as corpora of books, research articles, news, or business documents are an important resource for knowledge discovery. Exploring large document collections by hand is a cumbersome but necessary task to gain new insights and find relevant information. Our digitised society allows us to utilise algorithms to support the information seeking process, for example with the help of retrieval or recommender systems. However, these systems only provide selective views of the data and require some prior knowledge to issue meaningful queries and assess a system's response. The advancements of machine learning allow us to reduce this gap and better assist the information seeking process. For example, instead of sighting countless business documents by hand, journalists and investigators can employ natural language processing techniques, such as named entity recognition. Although this greatly improves the capabilities of a data exploration platform, the wealth of information is still overwhelming. An overview of the entirety of a dataset in the form of a two-dimensional map-like visualisation may help to circumvent this issue. Such overviews enable novel interaction paradigms for users, which are similar to the exploration of digital geographical maps. In particular, they can provide valuable context by indicating how a piece of information fits into the bigger picture. This thesis proposes algorithms that appropriately pre-process heterogeneous documents and compute the layout for datasets of all kinds. Traditionally, given high-dimensional semantic representations of the data, so-called dimensionality reduction algorithms are used to compute a layout of the data on a two-dimensional canvas. In this thesis, we focus on text corpora and go beyond only projecting the inherent semantic structure itself. Therefore, we propose three dimensionality reduction approaches that incorporate additional information into the layout process: (1) a multi-objective dimensionality reduction algorithm to jointly visualise semantic information with inherent network information derived from the underlying data; (2) a comparison of initialisation strategies for different dimensionality reduction algorithms to generate a series of layouts for corpora that grow and evolve over time; (3) and an algorithm that updates existing layouts by incorporating user feedback provided by pointwise drag-and-drop edits. This thesis also contains system prototypes to demonstrate the proposed technologies, including pre-processing and layout of the data and presentation in interactive user interfaces.},
}