% Bibliography records for Berfin Aktas' work on coreference variation.
% Text outside an entry is ignored by BibTeX, so notes live here and not inside the records.
%
% Record Aktas2023: doctoral thesis (school field: Universitaet Potsdam).
% NOTE(review): pages = {xviii, 195} looks like a front-matter plus page count
% rather than a page range; kept as exported. With biblatex this would
% presumably belong in pagetotal -- confirm before changing.
@phdthesis{Aktas2023,
  author   = {Aktas, Berfin},
  title    = {Variation in coreference patterns},
  school   = {Universit{\"a}t Potsdam},
  year     = {2023},
  pages    = {xviii, 195},
  doi      = {10.25932/publishup-59608},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-596086},
  language = {en},
  abstract = {This thesis explores the variation in coreference patterns across language modes (i.e., spoken and written) and text genres. The significance of research on variation in language use has been emphasized in a number of linguistic studies. For instance, Biber and Conrad [2009] state that "register/genre variation is a fundamental aspect of human language" and "Given the ubiquity of register/genre variation, an understanding of how linguistic features are used in patterned ways across text varieties is of central importance for both the description of particular languages and the development of cross-linguistic theories of language use."[p.23] We examine the variation across genres with the primary goal of contributing to the body of knowledge on the description of language use in English. On the computational side, we believe that incorporating linguistic knowledge into learning-based systems can boost the performance of automatic natural language processing systems, particularly for non-standard texts. Therefore, in addition to their descriptive value, the linguistic findings we provide in this study may prove to be helpful for improving the performance of automatic coreference resolution, which is essential for a good text understanding and beneficial for several downstream NLP applications, including machine translation and text summarization. In particular, we study a genre of texts that is formed of conversational interactions on the well-known social media platform Twitter. Two factors motivate us: First, Twitter conversations are realized in written form but resemble spoken communication [Scheffler, 2017], and therefore they form an atypical genre for the written mode.
Second, while Twitter texts are a complicated genre for automatic coreference resolution, due to their widespread use in the digital sphere, at the same time they are highly relevant for applications that seek to extract information or sentiments from users' messages. Thus, we are interested in discovering more about the linguistic and computational aspects of coreference in Twitter conversations. We first created a corpus of such conversations for this purpose and annotated it for coreference. We are interested in not only the coreference patterns but the overall discourse behavior of Twitter conversations. To address this, in addition to the coreference relations, we also annotated the coherence relations on the corpus we compiled. The corpus is available online in a newly developed form that allows for separating the tweets from their annotations. This study consists of three empirical analyses where we independently apply corpus-based, psycholinguistic and computational approaches for the investigation of variation in coreference patterns in a complementary manner. (1) We first make a descriptive analysis of variation across genres through a corpus-based study. We investigate the linguistic aspects of nominal coreference in Twitter conversations and we determine how this genre relates to other text genres in spoken and written modes. In addition to the variation across genres, studying the differences in spoken-written modes is also in focus of linguistic research since from Woolbert [1922]. (2) In order to investigate whether the language mode alone has any effect on coreference patterns, we carry out a crowdsourced experiment and analyze the patterns in the same genre for both spoken and written modes. (3) Finally, we explore the potentials of domain adaptation of automatic coreference resolution (ACR) for the conversational Twitter data.
In order to answer the question of how the genre of Twitter conversations relates to other genres in spoken and written modes with respect to coreference patterns, we employ a state-of-the-art neural ACR model [Lee et al., 2018] to examine whether ACR on Twitter conversations will benefit from mode-based separation in out-of-domain training data.}
}

% Record AktasStede2022: journal article in Discours, issue 31.
% - The publisher accent is written as the BibTeX special character {\'e}
%   (ASCII apostrophe); the export carried the spacing acute U+00B4 after the
%   backslash, which is not a LaTeX accent command.
% - "Maion" in the exported publisher name is corrected to "Maison".
%   NOTE(review): spelling assumed from context -- confirm against the
%   publisher record.
% - The export's series field was an exact copy of the journal name and is
%   dropped, because biblatex prints an article's series next to the journal
%   title and would show the name twice.
% NOTE(review): pages = {37} looks like a page count, not a range -- confirm.
@article{AktasStede2022,
  author    = {Aktas, Berfin and Stede, Manfred},
  title     = {Anaphoric distance in oral and written language},
  journal   = {Discours : revue de linguistique, psycholinguistique et informatique},
  number    = {31},
  pages     = {37},
  year      = {2022},
  publisher = {Universit{\'e} de Paris-Sorbonne, Maison Recherche},
  address   = {Paris},
  issn      = {1963-1723},
  doi       = {10.4000/discours.12383},
  language  = {en},
  abstract  = {We investigate the variation in oral and written language in terms of anaphoric distance (i.e., the textual distance between anaphors and their antecedents), expanding corpus-based research with experimental evidence. Contrastive corpus studies demonstrate that oral genres include longer average anaphoric distance than written genres, if the distance is measured in terms of clauses (Fox, 1987; Aktas \& Stede, 2020). We designed an experiment in order to examine the contrasts in oral and written mediums, using the same genre. We aim to gain more insight about the impact of the medium, in a situation where both mediums convey a similar level of spontaneity, informality and interactivity. We designed a story continuation study, where the participants are recruited via crowdsourcing. To our knowledge, this is the first study of its kind, where anaphoric distance is manipulated systematically in a language production experiment in order to examine medium distinctions. We observed that participants use more pronouns in oral medium than in written medium if the anaphoric distance is long.
This result is in line with the implications of the earlier corpus-based research. In addition, our results indicate that anaphoric distance has a larger effect in referential choice for the written medium.}
}