% Bartz, Yang, Meinel (2018): SEE, a conference paper in the AAAI-18 proceedings.
% Typed as inproceedings so that the proceedings title lives in `booktitle`;
% the standard BibTeX styles print neither `journal` nor `series` for misc
% entries, so the venue would otherwise be dropped from the bibliography.
% NOTE(review): `volume` is kept exactly as exported -- confirm it against the
% printed proceedings before relying on it.
@inproceedings{BartzYangMeinel2018,
  author    = {Bartz, Christian and Yang, Haojin and Meinel, Christoph},
  title     = {{SEE}: Towards semi-supervised end-to-end scene text recognition},
  booktitle = {Proceedings of the Thirty-Second {AAAI} Conference on Artificial Intelligence, Thirtieth Innovative Applications of Artificial Intelligence Conference, Eighth Symposium on Educational Advances in Artificial Intelligence},
  volume    = {10},
  publisher = {Association for the Advancement of Artificial Intelligence},
  address   = {Palo Alto},
  isbn      = {978-1-57735-800-8},
  pages     = {6674--6681},
  year      = {2018},
  abstract  = {Detecting and recognizing text in natural scene images is a challenging, yet not completely solved task. In recent years several new systems that try to solve at least one of the two sub-tasks (text detection and text recognition) have been proposed. In this paper we present SEE, a step towards semi-supervised neural networks for scene text detection and recognition, that can be optimized end-to-end. Most existing works consist of multiple deep neural networks and several pre-processing steps. In contrast to this, we propose to use a single deep neural network, that learns to detect and recognize text from natural images, in a semi-supervised way. SEE is a network that integrates and jointly learns a spatial transformer network, which can learn to detect text regions in an image, and a text recognition network that takes the identified text regions and recognizes their textual content. We introduce the idea behind our novel approach and show its feasibility, by performing a range of experiments on standard benchmark datasets, where we achieve competitive results.},
  language  = {en}
}
% Bartz, Yang, Bethge, Meinel (2019): LoANs, a workshop paper in the
% ACCV 2018 Workshops volume.
% Typed as inproceedings: the book title goes in `booktitle`, while `series`
% and `volume` name the numbered book series the volume appeared in (the ISSN
% below belongs to that series, not to the workshop volume itself).
% NOTE(review): the subtitle was restored because the export carried only the
% short title "LoANs" -- verify the wording against the DOI landing page.
@inproceedings{BartzYangBethgeetal.2019,
  author    = {Bartz, Christian and Yang, Haojin and Bethge, Joseph and Meinel, Christoph},
  title     = {{LoANs}: Weakly supervised object detection with localizer assessor networks},
  booktitle = {Computer Vision -- {ACCV} 2018 Workshops},
  series    = {Lecture Notes in Computer Science},
  volume    = {11367},
  publisher = {Springer},
  address   = {Cham},
  isbn      = {978-3-030-21074-8},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-030-21074-8_29},
  pages     = {341--356},
  year      = {2019},
  abstract  = {Recently, deep neural networks have achieved remarkable performance on the task of object detection and recognition. The reason for this success is mainly grounded in the availability of large scale, fully annotated datasets, but the creation of such a dataset is a complicated and costly task. In this paper, we propose a novel method for weakly supervised object detection that simplifies the process of gathering data for training an object detector. We train an ensemble of two models that work together in a student-teacher fashion. Our student (localizer) is a model that learns to localize an object, the teacher (assessor) assesses the quality of the localization and provides feedback to the student. The student uses this feedback to learn how to localize objects and is thus entirely supervised by the teacher, as we are using no labels for training the localizer. In our experiments, we show that our model is very robust to noise and reaches competitive performance compared to a state-of-the-art fully supervised approach. We also show the simplicity of creating a new dataset, based on a few videos (e.g. downloaded from YouTube) and artificially generated data.},
  language  = {en}
}
% Bartz, Krestel (2021): technical report no. 139, published as a book with
% its own ISBN by Universitaetsverlag Potsdam.
% The export listed `publisher` twice; BibTeX keeps only the first occurrence
% and warns about the second, and stricter tools reject the entry. The
% university is therefore stored under `institution`, and the press stays the
% sole `publisher`.
% NOTE(review): `series` was added so that `number` has something to refer to
% (standard styles warn about a number without a series); the series name is
% assumed from the ISSN -- confirm it against the title page.
% `pages` holds the page count (front matter, body), not a page range.
@book{BartzKrestel2021,
  author      = {Bartz, Christian and Krestel, Ralf},
  title       = {Deep learning for computer vision in the art domain},
  series      = {Technische Berichte des Hasso-Plattner-Instituts f{\"u}r Digital Engineering an der Universit{\"a}t Potsdam},
  number      = {139},
  publisher   = {Universit{\"a}tsverlag Potsdam},
  address     = {Potsdam},
  institution = {Universit{\"a}t Potsdam},
  isbn        = {978-3-86956-514-9},
  issn        = {1613-5652},
  doi         = {10.25932/publishup-51290},
  url         = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-512906},
  pages       = {vii, 79},
  year        = {2021},
  abstract    = {In recent years, computer vision algorithms based on machine learning have seen rapid development. In the past, research mostly focused on solving computer vision problems such as image classification or object detection on images displaying natural scenes. Nowadays other fields such as the field of cultural heritage, where an abundance of data is available, also get into the focus of research. In the line of current research endeavours, we collaborated with the Getty Research Institute which provided us with a challenging dataset, containing images of paintings and drawings. In this technical report, we present the results of the seminar "Deep Learning for Computer Vision". In this seminar, students of the Hasso Plattner Institute evaluated state-of-the-art approaches for image classification, object detection and image recognition on the dataset of the Getty Research Institute. The main challenge when applying modern computer vision methods to the available data is the availability of annotated training data, as the dataset provided by the Getty Research Institute does not contain a sufficient amount of annotated samples for the training of deep neural networks. However, throughout the report we show that it is possible to achieve satisfying to very good results, when using further publicly available datasets, such as the WikiArt dataset, for the training of machine learning models.},
  language    = {en}
}
% Bartz (2022): doctoral thesis, Universitaet Potsdam.
% Fields are grouped as: who/what, where/when, identifiers, then the abstract.
% `pages` holds the page count (front matter, body), not a page range.
@phdthesis{Bartz2022,
  author   = {Bartz, Christian},
  title    = {Reducing the annotation burden: deep learning for optical character recognition using less manual annotations},
  school   = {Universit{\"a}t Potsdam},
  year     = {2022},
  pages    = {xxiv, 183},
  doi      = {10.25932/publishup-55540},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-555407},
  language = {en},
  abstract = {Text is a ubiquitous entity in our world and daily life. We encounter it nearly everywhere in shops, on the street, or in our flats. Nowadays, more and more text is contained in digital images. These images are either taken using cameras, e.g., smartphone cameras, or taken using scanning devices such as document scanners. The sheer amount of available data, e.g., millions of images taken by Google Streetview, prohibits manual analysis and metadata extraction. Although much progress was made in the area of optical character recognition (OCR) for printed text in documents, broad areas of OCR are still not fully explored and hold many research challenges. With the mainstream usage of machine learning and especially deep learning, one of the most pressing problems is the availability and acquisition of annotated ground truth for the training of machine learning models because obtaining annotated training data using manual annotation mechanisms is time-consuming and costly. In this thesis, we address of how we can reduce the costs of acquiring ground truth annotations for the application of state-of-the-art machine learning methods to optical character recognition pipelines. To this end, we investigate how we can reduce the annotation cost by using only a fraction of the typically required ground truth annotations, e.g., for scene text recognition systems. We also investigate how we can use synthetic data to reduce the need of manual annotation work, e.g., in the area of document analysis for archival material. In the area of scene text recognition, we have developed a novel end-to-end scene text recognition system that can be trained using inexact supervision and shows competitive/state-of-the-art performance on standard benchmark datasets for scene text recognition. Our method consists of two independent neural networks, combined using spatial transformer networks. Both networks learn together to perform text localization and text recognition at the same time while only using annotations for the recognition task. We apply our model to end-to-end scene text recognition (meaning localization and recognition of words) and pure scene text recognition without any changes in the network architecture. In the second part of this thesis, we introduce novel approaches for using and generating synthetic data to analyze handwriting in archival data. First, we propose a novel preprocessing method to determine whether a given document page contains any handwriting. We propose a novel data synthesis strategy to train a classification model and show that our data synthesis strategy is viable by evaluating the trained model on real images from an archive. Second, we introduce the new analysis task of handwriting classification. Handwriting classification entails classifying a given handwritten word image into classes such as date, word, or number. Such an analysis step allows us to select the best fitting recognition model for subsequent text recognition; it also allows us to reason about the semantic content of a given document page without the need for fine-grained text recognition and further analysis steps, such as Named Entity Recognition. We show that our proposed approaches work well when trained on synthetic data. Further, we propose a flexible metric learning approach to allow zero-shot classification of classes unseen during the network's training. Last, we propose a novel data synthesis algorithm to train off-the-shelf pixel-wise semantic segmentation networks for documents. Our data synthesis pipeline is based on the famous Style-GAN architecture and can synthesize realistic document images with their corresponding segmentation annotation without the need for any annotated data!}
}