@article{WulffMientusNowaketal.2022,
  author    = {Wulff, Peter and Mientus, Lukas and Nowak, Anna and Borowski, Andreas},
  title     = {Utilizing a pretrained language model ({BERT}) to classify preservice physics teachers' written reflections},
  series    = {International journal of artificial intelligence in education},
  journal   = {International journal of artificial intelligence in education},
  volume    = {33},
  publisher = {Springer},
  address   = {New York},
  issn      = {1560-4292},
  doi       = {10.1007/s40593-022-00290-6},
  pages     = {439--466},
  year      = {2022},
  abstract  = {Computer-based analysis of preservice teachers' written reflections could enable educational scholars to design personalized and scalable intervention measures to support reflective writing. Algorithms and technologies in the domain of research related to artificial intelligence have been found to be useful in many tasks related to reflective writing analytics such as classification of text segments. However, mostly shallow learning algorithms have been employed so far. This study explores to what extent deep learning approaches can improve classification performance for segments of written reflections. To do so, a pretrained language model (BERT) was utilized to classify segments of preservice physics teachers' written reflections according to elements in a reflection-supporting model. Since BERT has been found to advance performance in many tasks, it was hypothesized to enhance classification performance for written reflections as well. We also compared the performance of BERT with other deep learning architectures and examined conditions for best performance. We found that BERT outperformed the other deep learning architectures and previously reported performances with shallow learning algorithms for classification of segments of reflective writing. BERT starts to outperform the other models when trained on about 20 to 30\% of the training data. Furthermore, attribution analyses for inputs yielded insights into important features for BERT's classification decisions. Our study indicates that pretrained language models such as BERT can boost performance for language-related tasks in educational contexts such as classification.},
  language  = {en}
}

@article{AyzelHeistermann2021,
  author    = {Ayzel, Georgy and Heistermann, Maik},
  title     = {The effect of calibration data length on the performance of a conceptual hydrological model versus {LSTM} and {GRU}},
  series    = {Computers \& geosciences : an international journal devoted to the publication of papers on all aspects of geocomputation and to the distribution of computer programs and test data sets ; an official journal of the International Association for Mathematical Geology},
  journal   = {Computers \& geosciences : an international journal devoted to the publication of papers on all aspects of geocomputation and to the distribution of computer programs and test data sets ; an official journal of the International Association for Mathematical Geology},
  volume    = {149},
  publisher = {Elsevier},
  address   = {Amsterdam},
  issn      = {0098-3004},
  doi       = {10.1016/j.cageo.2021.104708},
  pages     = {12},
  year      = {2021},
  abstract  = {We systematically explore the effect of calibration data length on the performance of a conceptual hydrological model, GR4H, in comparison to two Artificial Neural Network (ANN) architectures: Long Short-Term Memory Networks (LSTM) and Gated Recurrent Units (GRU), which have just recently been introduced to the field of hydrology. We implemented a case study for six river basins across the contiguous United States, with 25 years of meteorological and discharge data. Nine years were reserved for independent validation; two years were used as a warm-up period, one year for each of the calibration and validation periods, respectively; from the remaining 14 years, we sampled increasing amounts of data for model calibration, and found pronounced differences in model performance. While GR4H required less data to converge, LSTM and GRU caught up at a remarkable rate, considering their number of parameters. Also, LSTM and GRU exhibited the higher calibration instability in comparison to GR4H. These findings confirm the potential of modern deep-learning architectures in rainfall runoff modelling, but also highlight the noticeable differences between them in regard to the effect of calibration data length.},
  language  = {en}
}

@article{WangYangMeinel2018,
  author    = {Wang, Cheng and Yang, Haojin and Meinel, Christoph},
  title     = {Image Captioning with Deep Bidirectional {LSTMs} and Multi-Task Learning},
  series    = {ACM transactions on multimedia computing, communications, and applications},
  journal   = {ACM transactions on multimedia computing, communications, and applications},
  volume    = {14},
  number    = {2},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1551-6857},
  doi       = {10.1145/3115432},
  pages     = {20},
  year      = {2018},
  abstract  = {Generating a novel and descriptive caption of an image is drawing increasing interests in computer vision, natural language processing, and multimedia communities. In this work, we propose an end-to-end trainable deep bidirectional LSTM (Bi-LSTM (Long Short-Term Memory)) model to address the problem. By combining a deep convolutional neural network (CNN) and two separate LSTM networks, our model is capable of learning long-term visual-language interactions by making use of history and future context information at high-level semantic space. We also explore deep multimodal bidirectional models, in which we increase the depth of nonlinearity transition in different ways to learn hierarchical visual-language embeddings. Data augmentation techniques such as multi-crop, multi-scale, and vertical mirror are proposed to prevent over-fitting in training deep models. To understand how our models ``translate'' image to sentence, we visualize and qualitatively analyze the evolution of Bi-LSTM internal states over time. The effectiveness and generality of proposed models are evaluated on four benchmark datasets: Flickr8K, Flickr30K, MSCOCO, and Pascal1K datasets. We demonstrate that Bi-LSTM models achieve highly competitive performance on both caption generation and image-sentence retrieval even without integrating an additional mechanism (e.g., object detection, attention model). Our experiments also prove that multi-task learning is beneficial to increase model generality and gain performance. We also demonstrate the performance of transfer learning of the Bi-LSTM model significantly outperforms previous methods on the Pascal1K dataset.},
  language  = {en}
}

@article{RischKrestel2019,
  author    = {Risch, Julian and Krestel, Ralf},
  title     = {Domain-specific word embeddings for patent classification},
  series    = {Data Technologies and Applications},
  journal   = {Data Technologies and Applications},
  volume    = {53},
  number    = {1},
  publisher = {Emerald Group Publishing Limited},
  address   = {Bingley},
  issn      = {2514-9288},
  doi       = {10.1108/DTA-01-2019-0002},
  pages     = {108--122},
  year      = {2019},
  abstract  = {Purpose Patent offices and other stakeholders in the patent domain need to classify patent applications according to a standardized classification scheme. The purpose of this paper is to examine the novelty of an application it can then be compared to previously granted patents in the same class. Automatic classification would be highly beneficial, because of the large volume of patents and the domain-specific knowledge needed to accomplish this costly manual task. However, a challenge for the automation is patent-specific language use, such as special vocabulary and phrases. Design/methodology/approach To account for this language use, the authors present domain-specific pre-trained word embeddings for the patent domain. The authors train the model on a very large data set of more than 5m patents and evaluate it at the task of patent classification. To this end, the authors propose a deep learning approach based on gated recurrent units for automatic patent classification built on the trained word embeddings. Findings Experiments on a standardized evaluation data set show that the approach increases average precision for patent classification by 17 percent compared to state-of-the-art approaches. In this paper, the authors further investigate the model's strengths and weaknesses. An extensive error analysis reveals that the learned embeddings indeed mirror patent-specific language use. The imbalanced training data and underrepresented classes are the most difficult remaining challenge. Originality/value The proposed approach fulfills the need for domain-specific word embeddings for downstream tasks in the patent domain, such as patent classification or patent analysis.},
  language  = {en}
}

@article{AbdelwahabLandwehr2022,
  author    = {Abdelwahab, Ahmed and Landwehr, Niels},
  title     = {Deep Distributional Sequence Embeddings Based on a {Wasserstein} Loss},
  series    = {Neural processing letters},
  journal   = {Neural processing letters},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {1370-4621},
  doi       = {10.1007/s11063-022-10784-y},
  pages     = {21},
  year      = {2022},
  abstract  = {Deep metric learning employs deep neural networks to embed instances into a metric space such that distances between instances of the same class are small and distances between instances from different classes are large. In most existing deep metric learning techniques, the embedding of an instance is given by a feature vector produced by a deep neural network and Euclidean distance or cosine similarity defines distances between these vectors. This paper studies deep distributional embeddings of sequences, where the embedding of a sequence is given by the distribution of learned deep features across the sequence. The motivation for this is to better capture statistical information about the distribution of patterns within the sequence in the embedding. When embeddings are distributions rather than vectors, measuring distances between embeddings involves comparing their respective distributions. The paper therefore proposes a distance metric based on Wasserstein distances between the distributions and a corresponding loss function for metric learning, which leads to a novel end-to-end trainable embedding model. We empirically observe that distributional embeddings outperform standard vector embeddings and that training with the proposed Wasserstein metric outperforms training with other distance functions.},
  language  = {en}
}

@article{ShilonKrausBuecheleetal.2018,
  author    = {Shilon, I. and Kraus, M. and B{\"u}chele, M. and Egberts, Kathrin and Fischer, Tobias and Holch, Tim Lukas and Lohse, T. and Schwanke, U. and Steppa, Constantin Beverly and Funk, Stefan},
  title     = {Application of deep learning methods to analysis of imaging atmospheric {Cherenkov} telescopes data},
  series    = {Astroparticle physics},
  journal   = {Astroparticle physics},
  volume    = {105},
  publisher = {Elsevier},
  address   = {Amsterdam},
  issn      = {0927-6505},
  doi       = {10.1016/j.astropartphys.2018.10.003},
  pages     = {44--53},
  year      = {2018},
  abstract  = {Ground based gamma-ray observations with Imaging Atmospheric Cherenkov Telescopes (IACTs) play a significant role in the discovery of very high energy (E > 100 GeV) gamma-ray emitters. The analysis of IACT data demands a highly efficient background rejection technique, as well as methods to accurately determine the position of its source in the sky and the energy of the recorded gamma-ray. We present results for background rejection and signal direction reconstruction from first studies of a novel data analysis scheme for IACT measurements. The new analysis is based on a set of Convolutional Neural Networks (CNNs) applied to images from the four H.E.S.S. phase-I telescopes. As the H.E.S.S. cameras pixels are arranged in a hexagonal array, we demonstrate two ways to use such image data to train CNNs: by resampling the images to a square grid and by applying modified convolution kernels that conserve the hexagonal grid properties. The networks were trained on sets of Monte-Carlo simulated events and tested on both simulations and measured data from the H.E.S.S. array. A comparison between the CNN analysis to current state-of-the-art algorithms reveals a clear improvement in background rejection performance. When applied to H.E.S.S. observation data, the CNN direction reconstruction performs at a similar level as traditional methods. These results serve as a proof-of-concept for the application of CNNs to the analysis of events recorded by IACTs. (C) 2018 Published by Elsevier B.V.},
  language  = {en}
}