@article{WulffMientusNowaketal.2022, author = {Wulff, Peter and Mientus, Lukas and Nowak, Anna and Borowski, Andreas}, title = {Utilizing a pretrained language model ({BERT}) to classify preservice physics teachers' written reflections}, series = {International journal of artificial intelligence in education}, journal = {International journal of artificial intelligence in education}, volume = {33}, publisher = {Springer}, address = {New York}, issn = {1560-4292}, doi = {10.1007/s40593-022-00290-6}, pages = {439--466}, year = {2022}, abstract = {Computer-based analysis of preservice teachers' written reflections could enable educational scholars to design personalized and scalable intervention measures to support reflective writing. Algorithms and technologies in the domain of research related to artificial intelligence have been found to be useful in many tasks related to reflective writing analytics such as classification of text segments. However, mostly shallow learning algorithms have been employed so far. This study explores to what extent deep learning approaches can improve classification performance for segments of written reflections. To do so, a pretrained language model (BERT) was utilized to classify segments of preservice physics teachers' written reflections according to elements in a reflection-supporting model. Since BERT has been found to advance performance in many tasks, it was hypothesized to enhance classification performance for written reflections as well. We also compared the performance of BERT with other deep learning architectures and examined conditions for best performance. We found that BERT outperformed the other deep learning architectures and previously reported performances with shallow learning algorithms for classification of segments of reflective writing. BERT starts to outperform the other models when trained on about 20 to 30\% of the training data. 
Furthermore, attribution analyses for inputs yielded insights into important features for BERT's classification decisions. Our study indicates that pretrained language models such as BERT can boost performance for language-related tasks in educational contexts such as classification.}, language = {en} } @article{MadrugadeBritoOttoKuhlicke2021, author = {{Madruga de Brito}, Mariana and Otto, Danny and Kuhlicke, Christian}, title = {Tracking topics and frames regarding sustainability transformations during the onset of the {COVID-19} crisis}, series = {Sustainability / Multidisciplinary Digital Publishing Institute (MDPI)}, volume = {13}, journal = {Sustainability / Multidisciplinary Digital Publishing Institute (MDPI)}, number = {19}, publisher = {MDPI}, address = {Basel}, issn = {2071-1050}, doi = {10.3390/su131911095}, pages = {19}, year = {2021}, abstract = {Many researchers and politicians believe that the COVID-19 crisis may have opened a ``window of opportunity'' to spur sustainability transformations. Still, evidence for such a dynamic is currently lacking. Here, we propose the linkage of ``big data'' and ``thick data'' methods for monitoring debates on transformation processes by following the COVID-19 discourse on ecological sustainability in Germany. We analysed variations in the topics discussed by applying text mining techniques to a corpus with 84,500 newspaper articles published during the first COVID-19 wave. This allowed us to attain a unique and previously inaccessible ``bird's eye view'' of how these topics evolved. To deepen our understanding of prominent frames, a qualitative content analysis was undertaken. Furthermore, we investigated public awareness by analysing online search behaviour. The findings show an underrepresentation of sustainability topics in the German news during the early stages of the crisis. Similarly, public awareness regarding climate change was found to be reduced. 
Nevertheless, by examining the newspaper data in detail, we found that the pandemic is often seen as a chance for sustainability transformations---but not without a set of challenges. Our mixed-methods approach enabled us to bridge knowledge gaps between qualitative and quantitative research by ``thickening'' and providing context to data-driven analyses. By monitoring whether or not the current crisis is seen as a chance for sustainability transformations, we provide insights for environmental policy in times of crisis.}, language = {en} } @phdthesis{Sidarenka2019, author = {Sidarenka, Uladzimir}, title = {Sentiment analysis of {German} {Twitter}}, doi = {10.25932/publishup-43742}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-437422}, school = {Universit{\"a}t Potsdam}, pages = {vii, 217}, year = {2019}, abstract = {The immense popularity of online communication services in the last decade has not only upended our lives (with news spreading like wildfire on the Web, presidents announcing their decisions on Twitter, and the outcome of political elections being determined on Facebook) but also dramatically increased the amount of data exchanged on these platforms. Therefore, if we wish to understand the needs of modern society better and want to protect it from new threats, we urgently need more robust, higher-quality natural language processing (NLP) applications that can recognize such necessities and menaces automatically, by analyzing uncensored texts. Unfortunately, most NLP programs today have been created for standard language, as we know it from newspapers, or, in the best case, adapted to the specifics of English social media. This thesis reduces the existing deficit by entering the new frontier of German online communication and addressing one of its most prolific forms---users' conversations on Twitter. 
In particular, it explores the ways and means by how people express their opinions on this service, examines current approaches to automatic mining of these feelings, and proposes novel methods, which outperform state-of-the-art techniques. For this purpose, I introduce a new corpus of German tweets that have been manually annotated with sentiments, their targets and holders, as well as lexical polarity items and their contextual modifiers. Using these data, I explore four major areas of sentiment research: (i) generation of sentiment lexicons, (ii) fine-grained opinion mining, (iii) message-level polarity classification, and (iv) discourse-aware sentiment analysis. In the first task, I compare three popular groups of lexicon generation methods: dictionary-, corpus-, and word-embedding-based ones, finding that dictionary-based systems generally yield better polarity lists than the last two groups. Apart from this, I propose a linear projection algorithm, whose results surpass many existing automatically-generated lexicons. Afterwards, in the second task, I examine two common approaches to automatic prediction of sentiment spans, their sources, and targets: conditional random fields (CRFs) and recurrent neural networks, obtaining higher scores with the former model and improving these results even further by redefining the structure of CRF graphs. When dealing with message-level polarity classification, I juxtapose three major sentiment paradigms: lexicon-, machine-learning-, and deep-learning-based systems, and try to unite the first and last of these method groups by introducing a bidirectional neural network with lexicon-based attention. 
Finally, in order to make the new classifier aware of microblogs' discourse structure, I let it separately analyze the elementary discourse units of each tweet and infer the overall polarity of a message from the scores of its EDUs with the help of two new approaches: latent-marginalized CRFs and Recursive Dirichlet Process.}, language = {en} } @phdthesis{Che2017, author = {Che, Xiaoyin}, title = {E-lecture material enhancement based on automatic multimedia analysis}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-408224}, school = {Universit{\"a}t Potsdam}, pages = {xviii, 148}, year = {2017}, abstract = {In this era of high-speed informatization and globalization, online education is no longer an exquisite concept in the ivory tower, but a rapidly developing industry closely relevant to people's daily lives. Numerous lectures are recorded in form of multimedia data, uploaded to the Internet and made publicly accessible from anywhere in this world. These lectures are generally addressed as e-lectures. In recent years, a new popular form of e-lectures, the Massive Open Online Courses (MOOCs), boosts the growth of online education industry and somehow turns ``learning online'' into a fashion. As an e-learning provider, besides to keep improving the quality of e-lecture content, to provide better learning environment for online learners is also a highly important task. This task can be preceded in various ways, and one of them is to enhance and upgrade the learning materials provided: e-lectures could be more than videos. Moreover, this process of enhancement or upgrading should be done automatically, without giving extra burdens to the lecturers or teaching teams, and this is the aim of this thesis. The first part of this thesis is an integrated framework of multi-lingual subtitles production, which can help online learners penetrate the language barrier. 
The framework consists of Automatic Speech Recognition (ASR), Sentence Boundary Detection (SBD) and Machine Translation (MT), among which the proposed SBD solution is the major technical contribution, building on Deep Neural Network (DNN) and Word Vector (WV) and achieving state-of-the-art performance. Besides, a quantitative evaluation with dozens of volunteers is also introduced to measure how these auto-generated subtitles could actually help in context of e-lectures. Secondly, a technical solution ``TOG'' (Tree-Structure Outline Generation) is proposed to extract textual content from the displaying slides recorded in video and re-organize them into a hierarchical lecture outline, which may serve in multiple functions, such as preview, navigation and retrieval. TOG runs adaptively and can be roughly divided into intra-slide and inter-slides phases. Table detection and lecture video segmentation can be implemented as sub- or post-application in these two phases respectively. Evaluation on diverse e-lectures shows that all the outlines, tables and segments achieved are trustworthily accurate. Based on the subtitles and outlines previously created, lecture videos can be further split into sentence units and slide-based segment units. A lecture highlighting process is further applied on these units, in order to capture and mark the most important parts within the corresponding lecture, just as what people do with a pen when reading paper books. Sentence-level highlighting depends on the acoustic analysis on the audio track, while segment-level highlighting focuses on exploring clues from the statistical information of related transcripts and slide content. Both objective and subjective evaluations prove that the proposed lecture highlighting solution is with decent precision and welcomed by users. 
All above enhanced e-lecture materials have been already implemented in actual use or made available for implementation by convenient interfaces.}, language = {en} } @article{WulffBuschhueterWestphaletal.2022, author = {Wulff, Peter and Buschh{\"u}ter, David and Westphal, Andrea and Mientus, Lukas and Nowak, Anna and Borowski, Andreas}, title = {Bridging the gap between qualitative and quantitative assessment in science education research with machine learning}, series = {Journal of science education and technology}, volume = {31}, journal = {Journal of science education and technology}, number = {4}, publisher = {Springer}, address = {Dordrecht}, issn = {1059-0145}, doi = {10.1007/s10956-022-09969-w}, pages = {490--513}, year = {2022}, abstract = {Science education researchers typically face a trade-off between more quantitatively oriented confirmatory testing of hypotheses, or more qualitatively oriented exploration of novel hypotheses. More recently, open-ended, constructed response items were used to combine both approaches and advance assessment of complex science-related skills and competencies. For example, research in assessing science teachers' noticing and attention to classroom events benefitted from more open-ended response formats because teachers can present their own accounts. Then, open-ended responses are typically analyzed with some form of content analysis. However, language is noisy, ambiguous, and unsegmented and thus open-ended, constructed responses are complex to analyze. Uncovering patterns in these responses would benefit from more principled and systematic analysis tools. Consequently, computer-based methods with the help of machine learning and natural language processing were argued to be promising means to enhance assessment of noticing skills with constructed response formats. 
In particular, pretrained language models recently advanced the study of linguistic phenomena and thus could well advance assessment of complex constructs through constructed response items. This study examines potentials and challenges of a pretrained language model-based clustering approach to assess preservice physics teachers' attention to classroom events as elicited through open-ended written descriptions. It was examined to what extent the clustering approach could identify meaningful patterns in the constructed responses, and in what ways textual organization of the responses could be analyzed with the clusters. Preservice physics teachers (N = 75) were instructed to describe a standardized, video-recorded teaching situation in physics. The clustering approach was used to group related sentences. Results indicate that the pretrained language model-based clustering approach yields well-interpretable, specific, and robust clusters, which could be mapped to physics-specific and more general contents. Furthermore, the clusters facilitate advanced analysis of the textual organization of the constructed responses. Hence, we argue that machine learning and natural language processing provide science education researchers means to combine exploratory capabilities of qualitative research methods with the systematicity of quantitative methods.}, language = {en} }