@article{WangYangMeinel2018,
  author    = {Wang, Cheng and Yang, Haojin and Meinel, Christoph},
  title     = {Image Captioning with Deep Bidirectional LSTMs and Multi-Task Learning},
  journal   = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  volume    = {14},
  number    = {2},
  publisher = {Association for Computing Machinery},
  address   = {New York},
  issn      = {1551-6857},
  doi       = {10.1145/3115432},
  pages     = {20},
  year      = {2018},
  abstract  = {Generating a novel and descriptive caption of an image is drawing increasing interest in the computer vision, natural language processing, and multimedia communities. In this work, we propose an end-to-end trainable deep bidirectional LSTM (Bi-LSTM, Long Short-Term Memory) model to address the problem. By combining a deep convolutional neural network (CNN) and two separate LSTM networks, our model is capable of learning long-term visual-language interactions by making use of history and future context information in a high-level semantic space. We also explore deep multimodal bidirectional models, in which we increase the depth of the nonlinearity transitions in different ways to learn hierarchical visual-language embeddings. Data augmentation techniques such as multi-crop, multi-scale, and vertical mirroring are proposed to prevent over-fitting when training deep models. To understand how our models "translate" an image into a sentence, we visualize and qualitatively analyze the evolution of the Bi-LSTM internal states over time. The effectiveness and generality of the proposed models are evaluated on four benchmark datasets: Flickr8K, Flickr30K, MSCOCO, and Pascal1K. We demonstrate that the Bi-LSTM models achieve highly competitive performance on both caption generation and image-sentence retrieval, even without integrating additional mechanisms (e.g., object detection, attention models). Our experiments also show that multi-task learning is beneficial for increasing model generality and improving performance. We further demonstrate that transfer learning with the Bi-LSTM model significantly outperforms previous methods on the Pascal1K dataset.},
  language  = {en}
}

@article{WangYangMeinel2016,
  author    = {Wang, Cheng and Yang, Haojin and Meinel, Christoph},
  title     = {A Deep Semantic Framework for Multimodal Representation Learning},
  journal   = {Multimedia Tools and Applications: An International Journal},
  volume    = {75},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {1380-7501},
  doi       = {10.1007/s11042-016-3380-8},
  pages     = {9255--9276},
  year      = {2016},
  abstract  = {Multimodal representation learning has gained increasing importance in various real-world multimedia applications. Most previous approaches focused on exploring inter-modal correlation by learning a common or intermediate space in a conventional way, e.g., Canonical Correlation Analysis (CCA). These works neglected the fusion of multiple modalities at a higher semantic level. In this paper, inspired by the success of deep networks in multimedia computing, we propose a novel unified deep neural framework for multimodal representation learning. To capture the high-level semantic correlations across modalities, we adopt deep learning features as the image representation and topic features as the text representation, respectively. For joint model learning, a 5-layer neural network is designed and enforced with supervised pre-training in the first 3 layers for intra-modal regularization. Extensive experiments on the benchmark Wikipedia and MIR Flickr 25K datasets show that our approach achieves state-of-the-art results compared to both shallow and deep models in multimodal and cross-modal retrieval.},
  language  = {en}
}

@phdthesis{Wang2016,
  author   = {Wang, Cheng},
  title    = {Deep Learning of Multimodal Representations},
  school   = {Universit{\"a}t Potsdam},
  pages    = {142},
  year     = {2016},
  language = {en}
}