@conference {186, title = {Learning only a handful of latent variables produces neural-aligned CNN models of the ventral stream}, booktitle = {Computational and Systems Neuroscience (COSYNE) }, year = {2024}, month = {02.2024}, publisher = {Computational and Systems Neuroscience (COSYNE)}, organization = {Computational and Systems Neuroscience (COSYNE)}, address = {Lisbon, Portugal}, abstract = {

Image-computable modeling of primate ventral stream visual processing has made great strides via brain-mapped versions of convolutional neural networks (CNNs) that are optimized on thousands of object categories (ImageNet), the performance of which strongly predicts CNNs' neural alignment. However, human and primate visual intelligence extends far beyond object categorization, encompassing a diverse range of tasks, such as estimating the latent variables of object position or pose in the image. The influence of task choice on neural alignment in CNNs, compared to CNN architecture, remains underexplored, partly due to the scarcity of large-scale datasets with rich known labels beyond categories. 3D graphics engines, capable of creating training images with detailed information on various latent variables, offer a solution. Here, we asked how the choice of visual tasks used to train CNNs (i.e., the set of latent variables to be estimated) affects their ventral stream neural alignment. We focused on the estimation of variables such as object position and pose, and we tested CNNs' neural alignment via the Brain-Score open science platform. We found that some of these CNNs had neural alignment scores very close to those of models trained on ImageNet, even though their entire training experience was on synthetic images. Additionally, we found that training models on just a handful of latent variables achieved the same level of neural alignment as models trained on a much larger number of categories, suggesting that latent variable training is more efficient than category training in driving model-neural alignment. Moreover, we found that these models' neural alignment scores scale with the amount of synthetic data used during training, suggesting the potential of obtaining more aligned models with larger synthetic datasets. This study highlights the effectiveness of using synthetic datasets and latent variables in advancing image-computable models of the ventral visual stream.

}, doi = {https://hdl.handle.net/1721.1/153744}, author = {Xie, Yudi and Alter, Esther and Schwartz, Jeremy and DiCarlo, James J} } @article {175, title = {Robustified ANNs Reveal Wormholes Between Human Category Percepts}, journal = {arXiv}, year = {2023}, month = {08/2023}, type = {preprint}, abstract = {

The visual object category reports of artificial neural networks (ANNs) are notoriously sensitive to tiny, adversarial image perturbations. Because human category reports (aka human percepts) are thought to be insensitive to those same small-norm perturbations -- and locally stable in general -- this argues that ANNs are incomplete scientific models of human visual perception. Consistent with this, we show that when small-norm image perturbations are generated by standard ANN models, human object category percepts are indeed highly stable. However, in this very same "human-presumed-stable" regime, we find that robustified ANNs reliably discover low-norm image perturbations that strongly disrupt human percepts. These previously undetectable human perceptual disruptions are massive in amplitude, approaching the same level of sensitivity seen in robustified ANNs. Further, we show that robustified ANNs support precise perceptual state interventions: they guide the construction of low-norm image perturbations that strongly alter human category percepts toward specific prescribed percepts. These observations suggest that for arbitrary starting points in image space, there exists a set of nearby "wormholes", each leading the subject from their current category perceptual state into a semantically very different state. Moreover, contemporary ANN models of biological visual processing are now accurate enough to consistently guide us to those portals.

}, doi = {https://doi.org/10.48550/arXiv.2308.06887}, url = {https://arxiv.org/pdf/2308.06887.pdf}, author = {Gaziv, Guy and Lee, Michael J and DiCarlo, James J} } @conference {179, title = {Strong and Precise Modulation of Human Percepts via Robustified ANNs}, booktitle = {Neural Information Processing Systems}, year = {2023}, address = {New Orleans, Louisiana}, abstract = {

The visual object category reports of artificial neural networks (ANNs) are notoriously sensitive to tiny, adversarial image perturbations. Because human category reports (aka human percepts) are thought to be insensitive to those same small-norm perturbations -- and locally stable in general -- this argues that ANNs are incomplete scientific models of human visual perception. Consistent with this, we show that when small-norm image perturbations are generated by standard ANN models, human object category percepts are indeed highly stable. However, in this very same "human-presumed-stable" regime, we find that robustified ANNs reliably discover low-norm image perturbations that strongly disrupt human percepts. These previously undetectable human perceptual disruptions are massive in amplitude, approaching the same level of sensitivity seen in robustified ANNs. Further, we show that robustified ANNs support precise perceptual state interventions: they guide the construction of low-norm image perturbations that strongly alter human category percepts toward specific prescribed percepts. In sum, these contemporary models of biological visual processing are now accurate enough to guide strong and precise interventions on human perception.

}, url = {https://openreview.net/pdf?id=5GmTI4LNqX}, author = {Gaziv, Guy and Lee, Michael J and DiCarlo, James J} } @article {177, title = {A Unifying Principle for the Functional Organization of Visual Cortex}, journal = {bioRxiv}, year = {2023}, month = {2023}, abstract = {

A key feature of many cortical systems is functional organization: the arrangement of neurons with specific functional properties in characteristic spatial patterns across the cortical surface. However, the principles underlying the emergence and utility of functional organization are poorly understood. Here we develop the Topographic Deep Artificial Neural Network (TDANN), the first unified model to accurately predict the functional organization of multiple cortical areas in the primate visual system. We analyze the key factors responsible for the TDANN's success and find that it strikes a balance between two specific objectives: achieving a task-general sensory representation that is self-supervised, and maximizing the smoothness of responses across the cortical sheet according to a metric that scales relative to cortical surface area. In turn, the representations learned by the TDANN are lower dimensional and more brain-like than those in models that lack a spatial smoothness constraint. Finally, we provide evidence that the TDANN's functional organization balances performance with inter-area connection length, and use the resulting models for a proof-of-principle optimization of cortical prosthetic design. Our results thus offer a unified principle for understanding functional organization and a novel view of the functional role of the visual system in particular.

}, doi = { https://doi.org/10.1101/2023.05.18.541361}, url = {https://www.biorxiv.org/content/10.1101/2023.05.18.541361v1.full.pdf}, author = {Margalit, Eshed and Lee, Hyodong and Finzi, Dawn and DiCarlo, James J and Grill-Spector, Kalanit and Yamins, Daniel LK} } @article {170, title = {Adversarially trained neural representations may already be as robust as corresponding biological neural representations}, journal = {arXiv}, year = {2022}, month = {06/19/2022}, type = {preprint}, abstract = {

Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks.

}, doi = {https://doi.org/10.48550/arXiv.2206.11228}, url = {https://arxiv.org/abs/2206.11228v1}, author = {Guo, Chong and Lee, Michael J and Leclerc, Guillaume and Dapello, Joel and Rao, Yug and Madry, Aleksander and DiCarlo, James J} } @conference {173, title = {Primate Inferotemporal Cortex Neurons Generalize Better to Novel Image Distributions Than Analogous Deep Neural Networks Units}, booktitle = {SVRHM Workshop at Neural Information Processing Systems (NeurIPS)}, year = {2022}, month = {2022}, address = {New Orleans, Louisiana}, abstract = {

Humans are successfully able to recognize objects in a variety of image distributions. Today's artificial neural networks (ANNs), on the other hand, struggle to recognize objects in many image domains, especially those different from the training distribution. It is currently unclear which parts of the ANNs could be improved in order to close this generalization gap. In this work, we used recordings from primate high-level visual cortex (IT) to isolate whether ANNs lag behind primate generalization capabilities because of their encoder (transformations up to the penultimate layer), or their decoder (linear transformation into class labels). Specifically, we fit a linear decoder on images from one domain and evaluate transfer performance on twelve held-out domains, comparing fitting on primate IT representations vs. representations in ANN penultimate layers. To compare fairly, we scale the number of each ANN's units so that its in-domain performance matches that of the sampled IT population (i.e. 71 IT neural sites, 73\% binary-choice accuracy). We find that the sampled primate population achieves, on average, 68\% performance on the held-out domains. Comparably sampled populations of ANN model units generalize less well, maintaining on average 60\%. This is independent of the number of sampled units: models' out-of-domain accuracies consistently lag behind primate IT. These results suggest that making ANN model units more like primate IT will improve the generalization performance of ANNs.

}, url = {https://openreview.net/pdf?id=iPF7mhoWkOl}, author = {Bagus, Ayu Marliawaty I Gusti and Marques, Tiago and Sanghavi, Sachi and DiCarlo, James J and Schrimpf, Martin} } @conference {146, title = {Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream}, booktitle = {International Conference on Learning Representations 2022 Spotlight}, year = {2022}, month = {April 25, 2022}, type = {preprint}, abstract = {

After training on large datasets, certain deep neural networks are surprisingly good models of the neural mechanisms of adult primate visual object recognition. Nevertheless, these models are considered poor models of the development of the visual system because they posit millions of sequential, precisely coordinated synaptic updates, each based on a labeled image. While ongoing research is pursuing the use of unsupervised proxies for labels, we here explore a complementary strategy of reducing the required number of supervised synaptic updates to produce an adult-like ventral visual stream (as judged by the match to V1, V2, V4, IT, and behavior). Such models might require less precise machinery and energy expenditure to coordinate these updates and would thus move us closer to viable neuroscientific hypotheses about how the visual system wires itself up. Relative to standard model training on labeled images in ImageNet, we here demonstrate that the total number of supervised weight updates can be substantially reduced using three complementary strategies: First, we find that only 2\% of supervised updates (epochs and images) are needed to achieve ~80\% of a fully trained model's match to adult ventral stream. Specifically, training benefits predictions of higher visual cortex the most, whereas predictions of earlier areas improve only marginally over the course of training. Second, by improving the random distribution of synaptic connectivity, we find that 54\% of the brain match can already be achieved "at birth" (i.e. no training at all). Third, we find that, by training only ~5\% of model synapses, we can still achieve nearly 80\% of the match to the ventral stream. This approach further improves ImageNet performance over previous attempts in computer vision to minimize trained components, without substantially increasing the number of trained parameters. These results reflect first steps in modeling not just primate adult visual processing during inference, but also how the ventral visual stream might be "wired up" by evolution (a model's "birth" state) and by developmental learning (a model's updates based on visual experience).

}, keywords = {biologically plausible learning, computational neuroscience, convolutional neural networks, primate visual ventral stream}, doi = {10.1101/2020.06.08.140111}, url = {https://openreview.net/pdf?id=g1SzIRLQXMM}, author = {Geiger, Franziska and Schrimpf, Martin and Marques, Tiago and DiCarlo, James J} } @proceedings {163, title = {Combining Different V1 Brain Model Variants to Improve Robustness to Image Corruptions in CNNs}, journal = {Shared Visual Representations in Human \& Machine Intelligence - NeurIPS Workshop}, year = {2021}, month = {October 20, 2021}, publisher = {Neural Information Processing Systems}, abstract = {

While some convolutional neural networks (CNNs) have surpassed human visual abilities in object classification, they often struggle to recognize objects in images corrupted with different types of common noise patterns, highlighting a major limitation of this family of models. Recently, it has been shown that simulating a primary visual cortex (V1) at the front of CNNs leads to small improvements in robustness to these image perturbations. In this study, we start with the observation that different variants of the V1 model show gains for specific corruption types. We then build a new model using an ensembling technique, which combines multiple individual models with different V1 front-end variants. The model ensemble leverages the strengths of each individual model, leading to significant improvements in robustness across all corruption categories and outperforming the base model by 38\% on average. Finally, we show that using distillation, it is possible to partially compress the knowledge in the ensemble model into a single model with a V1 front-end. While the ensembling and distillation techniques used here are hardly biologically plausible, the results presented here demonstrate that by combining the specific strengths of different neuronal circuits in V1 it is possible to improve the robustness of CNNs for a wide range of perturbations.

}, url = {https://arxiv.org/abs/2110.10645}, author = {Baidya, Avinash and Dapello, Joel and DiCarlo, James J and Marques, Tiago} } @article {158, title = {The ThreeDWorld Transport Challenge: A Visually Guided Task-and-Motion Planning Benchmark for Physically Realistic Embodied AI}, journal = {arXiv}, year = {2021}, month = {03/25/2021}, type = {preprint}, abstract = {

We introduce a visually guided and physics-driven task-and-motion planning benchmark, which we call the ThreeDWorld Transport Challenge. In this challenge, an embodied agent equipped with two 9-DOF articulated arms is spawned randomly in a simulated physical home environment. The agent is required to find a small set of objects scattered around the house, pick them up, and transport them to a desired final location. We also position containers around the house that can be used as tools to assist with transporting objects efficiently. To complete the task, an embodied agent must plan a sequence of actions to change the state of a large number of objects in the face of realistic physical constraints. We build this benchmark challenge using the ThreeDWorld simulation: a virtual 3D environment where all objects respond to physics, and which can be controlled using a fully physics-driven navigation and interaction API. We evaluate several existing agents on this benchmark. Experimental results suggest that: 1) a pure RL model struggles on this challenge; 2) hierarchical planning-based agents can transport some objects but are still far from solving this task. We anticipate that this benchmark will empower researchers to develop more intelligent physics-driven robots for the physical world.

}, doi = {https://doi.org/10.48550/arXiv.2103.14025}, url = {https://arxiv.org/abs/2103.14025}, author = {Gan, Chuang and Zhou, Siyuan and Schwartz, Jeremy and Alter, Seth and Bhandwaldar, Abhishek and Gutfreund, Dan and Yamins, Daniel LK and DiCarlo, James J and McDermott, Josh and Torralba, Antonio} } @article {160, title = {Unsupervised changes in core object recognition behavior are predicted by neural plasticity in inferior temporal cortex}, journal = {eLife}, volume = {10}, year = {2021}, month = {Nov-06-2021}, abstract = {

Temporal continuity of object identity is a feature of natural visual input, and is potentially exploited -- in an unsupervised manner -- by the ventral visual stream to build the neural representation in inferior temporal (IT) cortex. Here we investigated whether plasticity of individual IT neurons underlies human core-object-recognition behavioral changes induced with unsupervised visual experience. We built a single-neuron plasticity model combined with a previously established IT population-to-recognition-behavior linking model to predict human learning effects. We found that our model, once constrained by neurophysiological data, largely predicted the mean direction, magnitude, and time course of human performance changes. We also found a previously unreported dependency of the observed human performance change on the initial task difficulty. This result adds support to the hypothesis that tolerant core object recognition in human and non-human primates is instructed -- at least in part -- by naturally occurring unsupervised temporal contiguity experience.

}, doi = {10.7554/eLife.60830}, url = {https://elifesciences.org/articles/60830}, author = {Jia, Xiaoxuan and Hong, Ha and DiCarlo, James J} } @article {151, title = {Simulating a Primary Visual Cortex at the Front of CNNs Improves Robustness to Image Perturbations}, journal = {Neural Information Processing Systems (NeurIPS; spotlight)}, year = {2020}, month = {June 17, 2020}, type = {preprint}, abstract = {

Current state-of-the-art object recognition models are largely based on convolutional neural network (CNN) architectures, which are loosely inspired by the primate visual system. However, these CNNs can be fooled by imperceptibly small, explicitly crafted perturbations, and struggle to recognize objects in corrupted images that are easily recognized by humans. Here, by making comparisons with primate neural data, we first observed that CNN models with a neural hidden layer that better matches primate primary visual cortex (V1) are also more robust to adversarial attacks. Inspired by this observation, we developed VOneNets, a new class of hybrid CNN vision models. Each VOneNet contains a fixed weight neural network front-end that simulates primate V1, called the VOneBlock, followed by a neural network back-end adapted from current CNN vision models. The VOneBlock is based on a classical neuroscientific model of V1: the linear-nonlinear-Poisson model, consisting of a biologically-constrained Gabor filter bank, simple and complex cell nonlinearities, and a V1 neuronal stochasticity generator. After training, VOneNets retain high ImageNet performance, but each is substantially more robust, outperforming the base CNNs and state-of-the-art methods by 18\% and 3\%, respectively, on a conglomerate benchmark of perturbations comprised of white box adversarial attacks and common image corruptions. Finally, we show that all components of the VOneBlock work in synergy to improve robustness. While current CNN architectures are arguably brain-inspired, the results presented here demonstrate that more precisely mimicking just one stage of the primate visual system leads to new gains in ImageNet-level computer vision applications.

}, doi = {10.1101/2020.06.16.154542}, url = {https://www.biorxiv.org/content/10.1101/2020.06.16.154542v27}, author = {Dapello, Joel and Marques, Tiago and Schrimpf, Martin and Geiger, Franziska and Cox, David D and DiCarlo, James J} } @article {168, title = {Comparing novel object learning in humans, models, and monkeys}, journal = {Journal of Vision}, volume = {19}, year = {2019}, month = {Jun-09-2019}, pages = {114b}, abstract = {

Humans readily learn to identify novel objects, and it has been hypothesized that plasticity in visual cortex supports this behavior. Contributing to this view are reports of experience-driven changes in the properties of neurons at many levels of visual cortex, from V1 to inferotemporal cortex (IT). Here, we ask if object learning might instead be explained by a simple model in which a static set of IT-like visual features is followed by a perceptron learner. Specifically, we measured human (268 subjects; 170,000+ trials) and nonhuman primate (NHP; 2 subjects, 300,000+ trials) behavior across a battery of 29 visuomotor association tasks that each required the subject to learn to discriminate between a pair of synthetically generated, never-before-seen 3D objects (58 distinct objects). Objects were rendered at varying scales, positions, and rotations; superimposed on naturalistic backgrounds; and presented for 200 msec. We then approximated the visual system's IT response to each image using models of ventral stream processing (i.e. specific deep neural networks trained on ImageNet categorization), and we applied a reward-based perceptron learner to the static set of features produced at the penultimate layer of each model. We report that our model is sufficient to explain both human and NHP rates of learning on these tasks. Additionally, we show humans, NHPs, and this model share the same pattern of performance over objects, but that NHPs reach criterion performance ~10× as slowly as humans (human t = 139, NHP t = 1149), suggesting humans have similar but more rapid learning mechanisms than their NHP cousins in this domain. Taken together, these results suggest the possibility that object learning is mediated by plasticity in a small population of "readout" neurons that learn and execute weighted sums of activity across an upstream sensory population representation (IT) that is largely stable.

}, issn = {1534-7362}, doi = {10.1167/19.10.114b}, url = {https://jov.arvojournals.org/article.aspx?articleid=2750359}, author = {Lee, Michael J and DiCarlo, James J} } @conference {166, title = {Using Brain-Score to Evaluate and Build Neural Networks for Brain-Like Object Recognition}, booktitle = {Computational and Systems Neuroscience (COSYNE)}, year = {2019}, address = {Denver, CO}, author = {Schrimpf, Martin and Kubilius, Jonas and Hong, Ha and Majaj, Najib and Rajalingham, Rishi and Issa, Elias B and Kar, Kohitij and Ziemba, Corey M and Bashivan, Pouya and Prescott-Roy, Jonathan and Schmidt, Kailyn and Yamins, Daniel LK and DiCarlo, James J} } @article {15, title = {Neural dynamics at successive stages of the ventral visual stream are consistent with hierarchical error signals.}, journal = {eLife}, volume = {7}, year = {2018}, month = {11/2018}, abstract = {

Ventral visual stream neural responses are dynamic, even for static image presentations. However, dynamical neural models of visual cortex are lacking, as most progress has been made modeling static, time-averaged responses. Here, we studied population neural dynamics during face detection across three cortical processing stages. Remarkably, ~30 milliseconds after the initially evoked response, we found that neurons in intermediate-level areas decreased their responses to typical configurations of their preferred face parts relative to their response for atypical configurations, even while neurons in higher areas achieved and maintained a preference for typical configurations. These hierarchical neural dynamics were inconsistent with standard feedforward circuits. Rather, recurrent models computing prediction errors between stages captured the observed temporal signatures. This model of neural dynamics, which simply augments the standard feedforward model of online vision, suggests that neural responses to static images may encode top-down prediction errors in addition to bottom-up feature estimates.

}, keywords = {Animals, Brain Mapping, Face, Humans, Macaca mulatta, Models, Neurological, Neurons, Pattern Recognition, Photic Stimulation, Reaction Time, Visual, Visual Cortex, Visual Perception}, issn = {2050-084X}, doi = {10.7554/eLife.42870}, url = {https://elifesciences.org/articles/42870}, author = {Issa, Elias B and Cadieu, Charles F and DiCarlo, James J} } @article {25, title = {Eight open questions in the computational modeling of higher sensory cortex}, journal = {Current Opinion in Neurobiology}, volume = {37}, year = {2016}, month = {01/2016}, pages = {114 - 120}, abstract = {

Propelled by advances in biologically inspired computer vision and artificial intelligence, the past five years have seen significant progress in using deep neural networks to model response patterns of neurons in visual cortex. In this paper, we briefly review this progress and then discuss eight key 'open questions' that we believe will drive research in computational models of sensory systems over the next five years, both in visual cortex and beyond.

}, issn = {1873-6882}, doi = {10.1016/j.conb.2016.02.001}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438816300022}, author = {Yamins, Daniel LK and DiCarlo, James J} } @article {27, title = {Explicit information for category-orthogonal object properties increases along the ventral stream}, journal = {Nature Neuroscience}, volume = {19}, year = {2016}, month = {02/2016}, pages = {613 - 622}, abstract = {

Extensive research has revealed that the ventral visual stream hierarchically builds a robust representation for supporting visual object categorization tasks. We systematically explored the ability of multiple ventral visual areas to support a variety of 'category-orthogonal' object properties such as position, size and pose. For complex naturalistic stimuli, we found that the inferior temporal (IT) population encodes all measured category-orthogonal object properties, including those properties often considered to be low-level features (for example, position), more explicitly than earlier ventral stream areas. We also found that the IT population better predicts human performance patterns across properties. A hierarchical neural network model based on simple computational principles generates these same cross-area patterns of information. Taken together, our empirical results support the hypothesis that all behaviorally relevant object properties are extracted in concert up the ventral visual hierarchy, and our computational model explains how that hierarchy might be built.

}, issn = {1097-6256}, doi = {10.1038/nn.4247}, url = {http://www.nature.com/articles/nn.4247}, author = {Hong, Ha and Yamins, Daniel L K and Majaj, Najib J and DiCarlo, James J} } @article {26, title = {Using goal-driven deep learning models to understand sensory cortex}, journal = {Nature Neuroscience}, volume = {19}, year = {2016}, month = {01/2016}, pages = {356 - 365}, abstract = {

Fueled by innovation in the computer vision and artificial intelligence communities, recent developments in computational neuroscience have used goal-driven hierarchical convolutional neural networks (HCNNs) to make strides in modeling neural single-unit and population responses in higher visual cortical areas. In this Perspective, we review the recent progress in a broader modeling context and describe some of the key technical innovations that have supported it. We then outline how the goal-driven HCNN approach can be used to delve even more deeply into understanding the development and organization of sensory cortical processing.

}, issn = {1097-6256}, doi = {10.1038/nn.4244}, url = {http://www.nature.com/articles/nn.4244.pdf}, author = {Yamins, Daniel L K and DiCarlo, James J} } @article {65, title = {Why is Real-World Visual Object Recognition Hard?}, journal = {PLoS Computational Biology}, volume = {4}, year = {2008}, month = {01/2008}, pages = {e27}, abstract = {

Progress in understanding the brain mechanisms underlying vision requires the construction of computational models that not only emulate the brain's anatomy and physiology, but ultimately match its performance on visual tasks. In recent years, "natural" images have become popular in the study of vision and have been used to show apparently impressive progress in building such models. Here, we challenge the use of uncontrolled "natural" images in guiding that progress. In particular, we show that a simple V1-like model -- a neuroscientist's "null" model, which should perform poorly at real-world visual object recognition tasks -- outperforms state-of-the-art object recognition systems (biologically inspired and otherwise) on a standard, ostensibly natural image recognition test. As a counterpoint, we designed a "simpler" recognition test to better span the real-world variation in object pose, position, and scale, and we show that this test correctly exposes the inadequacy of the V1-like model. Taken together, these results demonstrate that tests based on uncontrolled natural images can be seriously misleading, potentially guiding progress in the wrong direction. Instead, we reexamine what it means for images to be natural and argue for a renewed focus on the core problem of object recognition -- real-world image variation.

}, doi = {10.1371/journal.pcbi.0040027}, url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.0040027}, author = {Pinto, Nicolas and Cox, David D and DiCarlo, James J}, editor = {Friston, Karl J} } @article {83, title = {Learning and neural plasticity in visual object recognition}, journal = {Current Opinion in Neurobiology}, volume = {16}, year = {2006}, month = {01/2006}, pages = {152 - 158}, abstract = {

The capability of the adult primate visual system for rapid and accurate recognition of targets in cluttered, natural scenes far surpasses the abilities of state-of-the-art artificial vision systems. Understanding this capability remains a fundamental challenge in visual neuroscience. Recent experimental evidence suggests that adaptive coding strategies facilitated by underlying neural plasticity enable the adult brain to learn from visual experience and shape its ability to integrate and recognize coherent visual objects.

}, issn = {09594388}, doi = {10.1016/j.conb.2006.03.012}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438806000377}, author = {Kourtzi, Zoe and DiCarlo, James J} } @article {90, title = {{\textquoteright}Breaking{\textquoteright} position-invariant object recognition}, journal = {Nature Neuroscience}, volume = {8}, year = {2005}, month = {08/2005}, pages = {1145 - 1147}, abstract = {

While it is often assumed that objects can be recognized irrespective of where they fall on the retina, little is known about the mechanisms underlying this ability. By exposing human subjects to an altered world where some objects systematically changed identity during the transient blindness that accompanies eye movements, we induced predictable object confusions across retinal positions, effectively 'breaking' position invariance. Thus, position invariance is not a rigid property of vision but is constantly adapting to the statistics of the environment.

}, issn = {1097-6256}, doi = {10.1038/nn1519}, url = {http://www.nature.com/articles/nn1519.pdf}, author = {Cox, David D and Meier, Philip and Oertelt, Nadja and DiCarlo, James J} } @article {100, title = {Receptive field structure in cortical area 3b of the alert monkey}, journal = {Behavioural Brain Research}, volume = {135}, year = {2002}, month = {01/2002}, pages = {167 - 178}, abstract = {

More than 350 neurons with fingerpad receptive fields (RFs) were studied in cortical area 3b of three alert monkeys. Random dot patterns, which contain all stimulus patterns with equal probability, were scanned across these RFs at three velocities and eight directions to reveal the RFs' spatial and temporal structure. Area 3b RFs are characterized by three components: (1) a single, central excitatory region of short duration, (2) one or more inhibitory regions, also of short duration, that are adjacent to and nearly synchronous with the excitation, and (3) a region of inhibition that overlaps the excitation partially or totally and is temporally delayed with respect to the first two components. As a result of these properties, RF spatial structure depends on scanning direction but is virtually unaffected by changes in scanning velocity. This RF characterization, which is derived solely from responses to scanned random-dot patterns, predicts a neuron's responses to random patterns accurately, as expected, but it also predicts orientation sensitivity and preferred orientation measured with a scanned bar. Both orientation sensitivity and the ratio of coincident inhibition (number 2 above) to excitation are stronger in the supra- and infragranular layers than in layer IV.

}, keywords = {Action Potentials, Afferent, Animals, Brain Mapping, Evoked Potentials, Haplorhini, Models, Neurological, Neurons, Orientation, Reproducibility of Results, Skin, Somatosensory, Somatosensory Cortex}, issn = {01664328}, doi = {10.1016/S0166-4328(02)00162-6}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0166432802001626}, author = {DiCarlo, James J and Johnson, Kenneth O} }