@article{WangYangMeinel2016,
  author    = {Wang, Cheng and Yang, Haojin and Meinel, Christoph},
  title     = {A deep semantic framework for multimodal representation learning},
  journal   = {Multimedia Tools and Applications},
  volume    = {75},
  pages     = {9255--9276},
  year      = {2016},
  publisher = {Springer},
  address   = {Dordrecht},
  issn      = {1380-7501},
  doi       = {10.1007/s11042-016-3380-8},
  language  = {en},
  abstract  = {Multimodal representation learning has gained increasing importance in various real-world multimedia applications. Most previous approaches focused on exploring inter-modal correlation by learning a common or intermediate space in a conventional way, e.g. Canonical Correlation Analysis (CCA). These works neglected the fusion of multiple modalities at a higher semantic level. In this paper, inspired by the success of deep networks in multimedia computing, we propose a novel unified deep neural framework for multimodal representation learning. To capture high-level semantic correlations across modalities, we adopt deep learning features as the image representation and topic features as the text representation, respectively. In joint model learning, a 5-layer neural network is designed, with supervised pre-training enforced in the first 3 layers for intra-modal regularization. Extensive experiments on the benchmark Wikipedia and MIR Flickr 25K datasets show that our approach achieves state-of-the-art results compared to both shallow and deep models in multimodal and cross-modal retrieval.}
}