@article{DraisbachChristenNaumann2019, author = {Draisbach, Uwe and Christen, Peter and Naumann, Felix}, title = {Transforming pairwise duplicates to entity clusters for high-quality duplicate detection}, series = {ACM Journal of Data and Information Quality}, volume = {12}, journal = {ACM Journal of Data and Information Quality}, number = {1}, publisher = {Association for Computing Machinery}, address = {New York}, issn = {1936-1955}, doi = {10.1145/3352591}, pages = {1 -- 30}, year = {2019}, abstract = {Duplicate detection algorithms produce clusters of database records, each cluster representing a single real-world entity. As most of these algorithms use pairwise comparisons, the resulting (transitive) clusters can be inconsistent: Not all records within a cluster are sufficiently similar to be classified as duplicate. Thus, one of many subsequent clustering algorithms can further improve the result.
We explain in detail, compare, and evaluate many of these algorithms and introduce three new clustering algorithms in the specific context of duplicate detection. Two of our three new algorithms use the structure of the input graph to create consistent clusters. Our third algorithm, like many other clustering algorithms, focuses on the edge weights instead. For evaluation, in contrast to related work, we experiment on true real-world datasets, and in addition examine in great detail various pair-selection strategies used in practice. While no overall winner emerges, we are able to identify the best approaches for different situations. In scenarios with larger clusters, our proposed algorithm, Extended Maximum Clique Clustering (EMCC), and Markov Clustering show the best results. EMCC especially outperforms Markov Clustering regarding the precision of the results and additionally has the advantage that it can also be used in scenarios where edge weights are not available.}, language = {en} }
@article{CescaSenDahm2014, author = {Cesca, Simone and Sen, Ali Tolga and Dahm, Torsten}, title = {Seismicity monitoring by cluster analysis of moment tensors}, series = {Geophysical journal international}, volume = {196}, journal = {Geophysical journal international}, number = {3}, publisher = {Oxford Univ. Press}, address = {Oxford}, issn = {0956-540X}, doi = {10.1093/gji/ggt492}, pages = {1813 -- 1826}, year = {2014}, abstract = {We suggest a new clustering approach to classify focal mechanisms from large moment tensor catalogues, with the purpose of automatically identifying families of earthquakes with similar source geometry, recognizing the orientation of the most active faults, and detecting temporal variations of the rupture processes. The approach differs from waveform similarity methods in that clusters are detected even if the events are separated by large spatial distances. This approach is particularly helpful for analysing large moment tensor catalogues, as in microseismicity applications, where manual analysis and classification are not feasible. A flexible algorithm is proposed here: it can handle different metrics, norms, and focal mechanism representations. In particular, the method can handle full moment tensor or constrained source model catalogues, for which different metrics are suggested. The method can account for variable uncertainties of different moment tensor components. We verify the method with synthetic catalogues. An application to real data from mining-induced seismicity illustrates possible applications of the method and demonstrates the cluster detection and event classification performance with different moment tensor catalogues. The results prove that the main earthquake source types occur on spatially separated faults, and that temporal changes in the number and characterization of focal mechanism clusters are detected.
We suggest that moment tensor clustering can help assess time-dependent hazard in mines.}, language = {en} }
@article{FeherWhelanMueller2012, author = {Feher, Kristen and Whelan, James and M{\"u}ller, Samuel}, title = {Exploring multicollinearity using a random matrix theory approach}, series = {Statistical applications in genetics and molecular biology}, volume = {11}, journal = {Statistical applications in genetics and molecular biology}, number = {3}, publisher = {De Gruyter}, address = {Berlin}, issn = {1544-6115}, doi = {10.1515/1544-6115.1668}, pages = {35}, year = {2012}, abstract = {Clustering of gene expression data is often done with the latent aim of dimension reduction, by finding groups of genes that have a common response to potentially unknown stimuli. However, what is poorly understood to date is the behaviour of a low-dimensional signal embedded in high dimensions. This paper introduces a multicollinear model that is based on random matrix theory results and shows potential for the characterisation of a gene cluster's correlation matrix. This model projects a one-dimensional signal into many dimensions and is based on the spiked covariance model, but characterises the behaviour of the corresponding correlation matrix instead. The eigenspectrum of the correlation matrix is empirically examined by simulation, under the addition of noise to the original signal. The simulation results are then used to propose a procedure for estimating the dimension of clusters from data. Moreover, the simulation results warn against considering pairwise correlations in isolation, as the model provides a mechanism whereby the 'low' correlation of a pair of genes may simply be due to the interaction of high dimension and noise. Instead, collective information about all the variables is given by the eigenspectrum.}, language = {en} }
@article{ClubbBookhagenRheinwalt2019, author = {Clubb, Fiona J. and Bookhagen, Bodo and Rheinwalt, Aljoscha}, title = {Clustering river profiles to classify geomorphic domains}, series = {Journal of geophysical research : Earth surface}, volume = {124}, journal = {Journal of geophysical research : Earth surface}, number = {6}, publisher = {American Geophysical Union}, address = {Hoboken}, issn = {2169-9003}, doi = {10.1029/2019JF005025}, pages = {1417 -- 1439}, year = {2019}, abstract = {The structure and organization of river networks have been used for decades to investigate the influence of climate and tectonics on landscapes. The majority of these studies either analyze rivers in profile view by extracting channel steepness or calculate planform metrics such as drainage density. However, these techniques rely on the assumption of homogeneity: that intrinsic and external factors are spatially or temporally invariant over the measured profile. This assumption is violated for the majority of Earth's landscapes, where variations in uplift rate, rock strength, climate, and geomorphic process are almost ubiquitous. We propose a method for classifying river profiles to identify landscape regions with similar characteristics by adapting hierarchical clustering algorithms developed for time series data. We first test our clustering on two landscape evolution scenarios and find that we can successfully cluster regions with different erodibility and detect the transient response to sudden base level fall.
We then test our method in two real landscapes: first in Bitterroot National Forest, Idaho, where we demonstrate that our method can detect transient incision waves and the topographic signature of fluvial and debris flow process regimes; and second, on Santa Cruz Island, California, where our technique identifies spatial patterns in lithology not detectable through normalized channel steepness analysis. By calculating channel steepness separately for each cluster, our method allows the extraction of more reliable steepness metrics than if calculated for the landscape as a whole. These examples demonstrate the method's ability to disentangle fluvial morphology in complex lithological and tectonic settings.}, language = {en} }
@article{FeherWhelanMueller2011, author = {Feher, Kristen and Whelan, James and M{\"u}ller, Samuel}, title = {Assessing modularity using a random matrix theory approach}, series = {Statistical applications in genetics and molecular biology}, volume = {10}, journal = {Statistical applications in genetics and molecular biology}, number = {1}, publisher = {De Gruyter}, address = {Berlin}, issn = {2194-6302}, doi = {10.2202/1544-6115.1667}, pages = {36}, year = {2011}, abstract = {Random matrix theory (RMT) is well suited to describing the emergent properties of systems with complex interactions amongst their constituents through their eigenvalue spectra. Some RMT results are applied to the problem of clustering high-dimensional biological data with complex dependence structure amongst the variables. It will be shown that a gene relevance or correlation network can be constructed by choosing a correlation threshold in a principled way, such that it corresponds to a block diagonal structure in the correlation matrix, if such a structure exists. The structure is then found using community detection algorithms, but with parameter choice guided by RMT predictions. The resulting clustering is compared to a variety of hierarchical clustering outputs and is found to be the most generalised result, in that it captures all the features found by the other methods considered.}, language = {en} }
@phdthesis{Gruetze2018, author = {Gr{\"u}tze, Toni}, title = {Adding value to text with user-generated content}, school = {Universit{\"a}t Potsdam}, pages = {ii, 114}, year = {2018}, abstract = {In recent years, the ever-growing number of documents on the Web as well as in closed systems for private or business contexts has led to a considerable increase in valuable textual information about topics, events, and entities. It is a truism that the majority of information (i.e., business-relevant data) is only available in unstructured textual form. The text mining research field comprises various practice areas that have the common goal of harvesting high-quality information from textual data. This information helps address users' information needs. In this thesis, we utilize the knowledge represented in user-generated content (UGC) originating from various social media services to improve text mining results. These social media platforms provide a plethora of information with varying focuses. In many cases, an essential feature of such platforms is to share relevant content with a peer group. Thus, the data exchanged in these communities tend to be focused on the interests of the user base. The popularity of social media services is growing continuously and the inherent knowledge is available to be utilized. We show that this knowledge can be used for three different tasks.
First, we demonstrate that when searching for persons with ambiguous names, the information from Wikipedia can be bootstrapped to group web search results according to the individuals occurring in the documents. We introduce two models and different means to handle persons missing in the UGC source. We show that the proposed approaches outperform traditional algorithms for search result clustering. Second, we discuss how the categorization of texts according to continuously changing community-generated folksonomies helps users to identify new information related to their interests. We specifically target temporal changes in the UGC and show how they influence the quality of different tag recommendation approaches. Finally, we introduce an algorithm to tackle the entity linking problem, a necessity for harvesting entity knowledge from large text collections. The goal is to link mentions within the documents to their real-world entities. A major focus lies on the efficient derivation of coherent links. For each of the contributions, we provide a wide range of experiments on various text corpora as well as different sources of UGC. The evaluation shows the added value that using these sources provides and confirms the appropriateness of leveraging user-generated content to serve different information needs.}, language = {en} }