% Conference paper in the proceedings of the 32nd International Conference on
% Neural Information Processing Systems (2018), proceedings volume 31.
% The citation key is kept as-is so existing \cite commands keep resolving.
@inproceedings{GeirhosTemmeRauberetal.2018,
  author    = {Geirhos, Robert and Temme, Carlos R. Medina and Rauber, Jonas and Sch{\"u}tt, Heiko Herbert and Bethge, Matthias and Wichmann, Felix A.},
  title     = {Generalisation in humans and deep neural networks},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  volume    = {31},
  pages     = {7549--7561},
  publisher = {Curran Associates Inc.},
  address   = {Red Hook},
  year      = {2018},
  issn      = {1049-5258},
  language  = {en},
  abstract  = {We compare the robustness of humans and current convolutional deep neural networks (DNNs) on object recognition under twelve different types of image degradations. First, using three well known DNNs (ResNet-152, VGG-19, GoogLeNet) we find the human visual system to be more robust to nearly all of the tested image manipulations, and we observe progressively diverging classification error-patterns between humans and DNNs when the signal gets weaker. Secondly, we show that DNNs trained directly on distorted images consistently surpass human performance on the exact distortion types they were trained on, yet they display extremely poor generalisation abilities when tested on other distortion types. For example, training on salt-and-pepper noise does not imply robustness on uniform white noise and vice versa. Thus, changes in the noise distribution between training and testing constitutes a crucial challenge to deep learning vision systems that can be systematically addressed in a lifelong machine learning approach. Our new dataset consisting of 83K carefully measured human psychophysical trials provide a useful reference for lifelong robustness against image degradations set by the human visual system.},
}

% Short contribution (pp. 64--65) published in the journal Perception, volume 48 (2019).
% publisher/address/issn are retained as metadata; classic article styles ignore them.
@article{SchuettRothkegelTrukenbrodetal.2019,
  author    = {Sch{\"u}tt, Heiko Herbert and Rothkegel, Lars Oliver Martin and Trukenbrod, Hans Arne and Engbert, Ralf and Wichmann, Felix A.},
  title     = {Predicting fixation densities over time from early visual processing},
  journal   = {Perception},
  volume    = {48},
  pages     = {64--65},
  publisher = {Sage Publ.},
  address   = {London},
  year      = {2019},
  issn      = {0301-0066},
  language  = {en},
  abstract  = {Bottom-up saliency is often cited as a factor driving the choice of fixation locations of human observers, based on the (partial) success of saliency models to predict fixation densities in free viewing. However, these observations are only weak evidence for a causal role of bottom-up saliency in natural viewing behaviour. To test bottom-up saliency more directly, we analyse the performance of a number of saliency models---including our own saliency model based on our recently published model of early visual processing (Sch{\"u}tt \& Wichmann, 2017, JoV)---as well as the theoretical limits for predictions over time. On free viewing data our model performs better than classical bottom-up saliency models, but worse than the current deep learning based saliency models incorporating higher-level information like knowledge about objects. However, on search data all saliency models perform worse than the optimal image independent prediction. We observe that the fixation density in free viewing is not stationary over time, but changes over the course of a trial. It starts with a pronounced central fixation bias on the first chosen fixation, which is nonetheless influenced by image content. Starting with the 2nd to 3rd fixation, the fixation density is already well predicted by later densities, but more concentrated. From there the fixation distribution broadens until it reaches a stationary distribution around the 10th fixation. Taken together these observations argue against bottom-up saliency as a mechanistic explanation for eye movement control after the initial orienting reaction in the first one to two saccades, although we confirm the predictive value of early visual representations for fixation locations. The fixation distribution is, first, not well described by any stationary density, second, is predicted better when including object information and, third, is badly predicted by any saliency model in a search task.},
}