@phdthesis{Aktas2023, author = {Aktas, Berfin}, title = {Variation in coreference patterns}, doi = {10.25932/publishup-59608}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-596086}, school = {Universit{\"a}t Potsdam}, pages = {xviii, 195}, year = {2023}, abstract = {This thesis explores the variation in coreference patterns across language modes (i.e., spoken and written) and text genres. The significance of research on variation in language use has been emphasized in a number of linguistic studies. For instance, Biber and Conrad [2009] state that "register/genre variation is a fundamental aspect of human language" and "Given the ubiquity of register/genre variation, an understanding of how linguistic features are used in patterned ways across text varieties is of central importance for both the description of particular languages and the development of cross-linguistic theories of language use."[p.23] We examine the variation across genres with the primary goal of contributing to the body of knowledge on the description of language use in English. On the computational side, we believe that incorporating linguistic knowledge into learning-based systems can boost the performance of automatic natural language processing systems, particularly for non-standard texts. Therefore, in addition to their descriptive value, the linguistic findings we provide in this study may prove to be helpful for improving the performance of automatic coreference resolution, which is essential for a good text understanding and beneficial for several downstream NLP applications, including machine translation and text summarization. In particular, we study a genre of texts that is formed of conversational interactions on the well-known social media platform Twitter. Two factors motivate us: First, Twitter conversations are realized in written form but resemble spoken communication [Scheffler, 2017], and therefore they form an atypical genre for the written mode. Second, while Twitter texts are a complicated genre for automatic coreference resolution, due to their widespread use in the digital sphere, at the same time they are highly relevant for applications that seek to extract information or sentiments from users' messages. Thus, we are interested in discovering more about the linguistic and computational aspects of coreference in Twitter conversations. We first created a corpus of such conversations for this purpose and annotated it for coreference. We are interested in not only the coreference patterns but the overall discourse behavior of Twitter conversations. To address this, in addition to the coreference relations, we also annotated the coherence relations on the corpus we compiled. The corpus is available online in a newly developed form that allows for separating the tweets from their annotations. This study consists of three empirical analyses where we independently apply corpus-based, psycholinguistic and computational approaches for the investigation of variation in coreference patterns in a complementary manner. (1) We first make a descriptive analysis of variation across genres through a corpus-based study. We investigate the linguistic aspects of nominal coreference in Twitter conversations and we determine how this genre relates to other text genres in spoken and written modes. In addition to the variation across genres, studying the differences in spoken-written modes is also in focus of linguistic research since from Woolbert [1922]. (2) In order to investigate whether the language mode alone has any effect on coreference patterns, we carry out a crowdsourced experiment and analyze the patterns in the same genre for both spoken and written modes. (3) Finally, we explore the potentials of domain adaptation of automatic coreference resolution (ACR) for the conversational Twitter data. In order to answer the question of how the genre of Twitter conversations relates to other genres in spoken and written modes with respect to coreference patterns, we employ a state-of-the-art neural ACR model [Lee et al., 2018] to examine whether ACR on Twitter conversations will benefit from mode-based separation in out-of-domain training data.}, language = {en} }