@booklet {185, title = {How does the primate brain combine generative and discriminative computations in vision?}, year = {2024}, abstract = {

Vision is widely understood as an inference problem. However, two contrasting conceptions of the inference process have each been influential in research on biological vision as well as the engineering of machine vision. The first emphasizes bottom-up signal flow, describing vision as a largely feedforward, discriminative inference process that filters and transforms the visual information to remove irrelevant variation and represent behaviorally relevant information in a format suitable for downstream functions of cognition and behavioral control. In this conception, vision is driven by the sensory data, and perception is direct because the processing proceeds from the data to the latent variables of interest. The notion of "inference" in this conception is that of the engineering literature on neural networks, where feedforward convolutional neural networks processing images are said to perform inference. The alternative conception is that of vision as an inference process in Helmholtz's sense, where the sensory evidence is evaluated in the context of a generative model of the causal processes giving rise to it. In this conception, vision inverts a generative model through an interrogation of the evidence in a process often thought to involve top-down predictions of sensory data to evaluate the likelihood of alternative hypotheses. The authors include scientists rooted, in roughly equal numbers, in each of the two conceptions, who are motivated to overcome what might be a false dichotomy between them and to engage the other perspective in the realm of theory and experiment. The primate brain employs an unknown algorithm that may combine the advantages of both conceptions. We explain and clarify the terminology, review the key empirical evidence, and propose an empirical research program that transcends the dichotomy and sets the stage for revealing the mysterious hybrid algorithm of primate vision.

}, keywords = {discriminative model, generative model, primate vision, visual inference}, doi = {10.48550/arXiv.2401.06005}, url = {https://arxiv.org/pdf/2401.06005.pdf}, author = {Benjamin Peters and James J. DiCarlo and Todd Gureckis and Ralf Haefner and Leyla Isik and Joshua Tenenbaum and Talia Konkle and Thomas Naselaris and Kimberly Stachenfeld and Zenna Tavares and Doris Tsao and Ilker Yildirim and Nikolaus Kriegeskorte} } @conference {186, title = {Learning only a handful of latent variables produces neural-aligned CNN models of the ventral stream}, booktitle = {Computational and Systems Neuroscience (COSYNE)}, year = {2024}, month = {02.2024}, publisher = {Computational and Systems Neuroscience (COSYNE)}, organization = {Computational and Systems Neuroscience (COSYNE)}, address = {Lisbon, Portugal}, abstract = {

Image-computable modeling of primate ventral stream visual processing has made great strides via brain-mapped versions of convolutional neural networks (CNNs) that are optimized on thousands of object categories (ImageNet), the performance of which strongly predicts CNNs' neural alignment. However, human and primate visual intelligence extends far beyond object categorization, encompassing a diverse range of tasks, such as estimating the latent variables of object position or pose in the image. The influence of task choice on neural alignment in CNNs, compared to CNN architecture, remains underexplored, partly due to the scarcity of large-scale datasets with rich known labels beyond categories. 3D graphics engines, capable of creating training images with detailed information on various latent variables, offer a solution. Here, we asked how the choice of visual tasks that are used to train CNNs (i.e., the set of latent variables to be estimated) affects their ventral stream neural alignment. We focused on the estimation of variables such as object position and pose, and we tested CNNs' neural alignment via the Brain-Score open science platform. We found that some of these CNNs had neural alignment scores very close to those of CNNs trained on ImageNet, even though their entire training experience was on synthetic images. Additionally, we found that training models on just a handful of latent variables achieved the same level of neural alignment as training on a much larger number of categories, suggesting that latent variable training is more efficient than category training in driving model-neural alignment. Moreover, we found that these models' neural alignment scores scale with the amount of synthetic data used during training, suggesting the potential of obtaining more aligned models with larger synthetic datasets. This study highlights the effectiveness of using synthetic datasets and latent variables in advancing image-computable models of the ventral visual stream.

}, url = {https://hdl.handle.net/1721.1/153744}, author = {Xie, Yudi and Alter, Esther and Schwartz, Jeremy and DiCarlo, James J} } @article {174, title = {Catalyzing next-generation Artificial Intelligence through NeuroAI}, journal = {Nature Communications}, volume = {14}, year = {2023}, month = {Jan-12-2023}, pages = {1597}, abstract = {

Neuroscience has long been an essential driver of progress in artificial intelligence (AI). We propose that to accelerate progress in AI, we must invest in fundamental research in NeuroAI. A core component of this is the embodied Turing test, which challenges AI animal models to interact with the sensorimotor world at skill levels akin to their living counterparts. The embodied Turing test shifts the focus from those capabilities like game playing and language that are especially well-developed or uniquely human to those capabilities -- inherited from over 500 million years of evolution -- that are shared with all animals. Building models that can pass the embodied Turing test will provide a roadmap for the next generation of AI.

}, doi = {10.1038/s41467-023-37180-x}, url = {https://www.nature.com/articles/s41467-023-37180-x}, author = {Zador, Anthony and Escola, Sean and Richards, Blake and {\"O}lveczky, Bence and Bengio, Yoshua and Boahen, Kwabena and Botvinick, Matthew and Chklovskii, Dmitri and Churchland, Anne and Clopath, Claudia and DiCarlo, James and Ganguli, Surya and Hawkins, Jeff and K{\"o}rding, Konrad and Koulakov, Alexei and LeCun, Yann and Lillicrap, Timothy and Marblestone, Adam and Olshausen, Bruno and Pouget, Alexandre and Savin, Cristina and Sejnowski, Terrence and Simoncelli, Eero and Solla, Sara and Sussillo, David and Tolias, Andreas S. and Tsao, Doris} } @article {172, title = {An empirical assay of view-invariant object learning in humans and comparison with baseline image-computable models}, journal = {bioRxiv}, year = {2023}, month = {2023/01/01}, pages = {2022.12.31.522402}, type = {preprint}, abstract = {

How humans learn new visual objects is a longstanding scientific problem. Previous work has led to a diverse collection of models for how it is accomplished, but a current limitation in the field is a lack of empirical benchmarks which can be used to evaluate and compare specific models against each other. Here, we use online psychophysics to measure human behavioral learning trajectories over a set of tasks involving novel 3D objects. Consistent with intuition, these results show that humans generally require very few images ($\approx$ 6) to approach their asymptotic accuracy, find some object discriminations easier to learn than others, and generalize quite well over a range of image transformations after even one view of each object. We then use those data to develop benchmarks that may be used to evaluate a learning model's similarity to humans. We make these data and benchmarks publicly available [GitHub], and, to our knowledge, they are currently the largest publicly-available collection of learning-related psychophysics data in humans. Additionally, to serve as baselines for those benchmarks, we implement and test a large number of baseline models (n=1,932), each based on a standard cognitive theory of learning: that humans re-represent images in a fixed, Euclidean space, then learn linear decision boundaries in that space to identify objects in future images. We find some of these baseline models make surprisingly accurate predictions. However, we also find reliable prediction gaps between all baseline models and humans, particularly in the few-shot learning setting.

}, doi = {10.1101/2022.12.31.522402}, url = {https://www.biorxiv.org/content/biorxiv/early/2023/01/02/2022.12.31.522402.full.pdf}, author = {Lee, Michael J. and DiCarlo, James J.} } @article {178, title = {fROI-level computational models enable broad-scale experimental testing and expose key divergences between models and brains}, journal = {Journal of Vision}, volume = {23}, year = {2023}, month = {2023}, pages = {5788}, abstract = {

Deep convolutional neural network (DNN)-based models have emerged as our leading hypotheses of human vision. Here we describe, and expand upon, our latest effort to use DNN models of brain regions to explain key results from previous cognitive neuroscience and psychology experiments. Many stimuli in these prior experiments were highly manipulated (e.g., scrambled body parts, face parts, re-arranged spatial positions), often outside the domain of natural stimuli. These results can therefore be considered as tests of model generalization beyond naturalistic stimuli. We first performed these tests on the fusiform face area (FFA), parahippocampal place area (PPA) and the extrastriate body area (EBA). Our previous results (presented at VSS 2022) showed that our fROI-level models recapitulate several key results from prior studies. We also observed that models did not perform as well on non-naturalistic stimuli. Here we extend our model evaluation metrics in two ways. First, we replicated findings from the original paper on the EBA (Downing et al., 2001) that the EBA responds as strongly to line drawings of bodies and symbolic stick figures as to natural images of bodies (and not to control conditions like faces and objects). Second, we find that none of the computational models explain this pattern of observed responses, though models trained with language-based supervision (like CLIP) do better than other models. Together, our results on symbolic body images expose the bounds of current computational models. This progress was made possible only because of fROI-level modeling procedures, and opens up new ways to understand the power and limitations of current models and to test novel hypotheses completely in silico.

}, issn = {1534-7362}, doi = {10.1167/jov.23.9.5788}, author = {Mieczkowski, Elizabeth and Abate, Alex and De Faria, Willian and Lydic, Kirsten and DiCarlo, James and Kanwisher, Nancy and Apurva Ratan Murty, N.} } @article {183, title = {How well do rudimentary plasticity rules predict adult visual object learning?}, journal = {PLOS Computational Biology}, volume = {19}, year = {2023}, month = {Nov-12-2023}, pages = {e1011713}, abstract = {

A core problem in visual object learning is using a finite number of images of a new object to accurately identify that object in future, novel images.

One longstanding, conceptual hypothesis asserts that this core problem is solved by adult brains through two connected mechanisms: 1) the re-representation of incoming retinal images as points in a fixed, multidimensional neural space, and 2) the optimization of linear decision boundaries in that space, via simple plasticity rules applied to a single downstream layer.

Though this scheme is biologically plausible, the extent to which it explains learning behavior in humans has been unclear, in part because of a historical lack of image-computable models of the putative neural space, and in part because of a lack of measurements of human learning behaviors in difficult, naturalistic settings.

Here, we addressed these gaps by 1) drawing from contemporary, image-computable models of the primate ventral visual stream to create a large set of testable learning models (n = 2,408 models), and 2) using online psychophysics to measure human learning trajectories over a varied set of tasks involving novel 3D objects (n = 371,000 trials), which we then used to develop (and publicly release) empirical benchmarks for comparing learning models to humans.

We evaluated each learning model on these benchmarks, and found that those based on deep, high-level representations from neural networks were surprisingly aligned with human behavior. While no tested model explained the entirety of replicable human behavior, these results establish that rudimentary plasticity rules, when combined with appropriate visual representations, have high explanatory power in predicting human behavior on this core object learning problem.

}, issn = {1553-734X}, doi = {10.1371/journal.pcbi.1011713}, url = {https://dx.plos.org/10.1371/journal.pcbi.1011713}, author = {Lee, Michael J. and DiCarlo, James J.}, editor = {Kietzmann, Tim Christian} } @article {180, title = {Let{\textquoteright}s move forward: Image-computable models and a common model evaluation scheme are prerequisites for a scientific understanding of human vision}, journal = {Behavioral and Brain Sciences}, volume = {46}, year = {2023}, month = {Jan-01-2023}, abstract = {

In the target article, Bowers et al. dispute deep artificial neural network (ANN) models as the currently leading models of human vision without producing alternatives. They eschew the use of public benchmarking platforms to compare vision models with the brain and behavior, and they advocate for a fragmented, phenomenon-specific modeling approach. These positions are unconstructive to scientific progress. We outline how the Brain-Score community is moving forward to add new model-to-human comparisons to its community-transparent suite of benchmarks.

}, issn = {0140-525X}, doi = {10.1017/S0140525X23001607}, url = {https://www.cambridge.org/core/product/identifier/S0140525X23001607/type/journal_article}, author = {DiCarlo, James J. and Yamins, Daniel L. K. and Ferguson, Michael E. and Fedorenko, Evelina and Bethge, Matthias and Bonnen, Tyler and Schrimpf, Martin} } @conference {182, title = {Probing Biological and Artificial Neural Networks with Task-dependent Neural Manifolds}, booktitle = {Conference on Parsimony and Learning (Proceedings Track)}, year = {2023}, address = {Hong Kong, China}, abstract = {

Recently, growth in our understanding of the computations performed in both biological and artificial neural networks has largely been driven by either low-level mechanistic studies or global normative approaches. However, concrete methodologies for bridging the gap between these levels of abstraction remain elusive. In this work, we investigate the internal mechanisms of neural networks through the lens of neural population geometry, aiming to provide understanding at an intermediate level of abstraction, as a way to bridge that gap. Utilizing manifold capacity theory (MCT) from statistical physics and manifold alignment analysis (MAA) from high-dimensional statistics, we probe the underlying organization of task-dependent manifolds in deep neural networks and macaque neural recordings. Specifically, we quantitatively characterize how different learning objectives lead to differences in the organizational strategies of these models and demonstrate how these geometric analyses are connected to the decodability of task-relevant information. These analyses present a strong direction for bridging mechanistic and normative theories in neural networks through neural population geometry, potentially opening up many future research avenues in both machine learning and neuroscience.

}, keywords = {Biologically inspired vision models, computational neuroscience, Neural Geometry, Neural Manifolds, Neuro-AI, Representational Geometry}, url = {https://openreview.net/forum?id=MxBS6aw5Gd}, author = {Michael Kuoch and Chi-Ning Chou and Nikhil Parthasarathy and Joel Dapello and James J. DiCarlo and Haim Sompolinsky and SueYeon Chung} } @article {181, title = {The Quest for an Integrated Set of Neural Mechanisms Underlying Object Recognition in Primates}, journal = {arXiv}, year = {2023}, month = {12/10/2023}, type = {preprint}, abstract = {

Visual object recognition -- the behavioral ability to rapidly and accurately categorize many visually encountered objects -- is core to primate cognition. This behavioral capability is algorithmically impressive because of the myriad identity-preserving viewpoints and scenes that dramatically change the visual image produced by the same object. Until recently, the brain mechanisms that support that capability were deeply mysterious. However, over the last decade, this scientific mystery has been illuminated by the discovery and development of brain-inspired, image-computable, artificial neural network (ANN) systems that rival primates in this behavioral feat. Apart from fundamentally changing the landscape of artificial intelligence (AI), modified versions of these ANN systems are the current leading scientific hypotheses of an integrated set of mechanisms in the primate ventral visual stream that support object recognition. What separates brain-mapped versions of these systems from prior conceptual models is that they are Sensory-computable, Mechanistic, Anatomically Referenced, and Testable (SMART). Here, we review and provide perspective on the brain mechanisms that the currently leading SMART models address. We review the empirical brain and behavioral alignment successes and failures of those current models. Given ongoing advances in neurobehavioral measurements and AI, we discuss the next frontiers for even more accurate mechanistic understanding. And we outline the likely applications of that SMART-model-based understanding.

}, doi = {10.48550/arXiv.2312.05956}, url = {https://arxiv.org/pdf/2312.05956.pdf}, author = {Kohitij Kar and James J DiCarlo} } @article {175, title = {Robustified ANNs Reveal Wormholes Between Human Category Percepts}, journal = {arXiv}, year = {2023}, month = {08/2023}, type = {preprint}, abstract = {

The visual object category reports of artificial neural networks (ANNs) are notoriously sensitive to tiny, adversarial image perturbations. Because human category reports (aka human percepts) are thought to be insensitive to those same small-norm perturbations -- and locally stable in general -- this argues that ANNs are incomplete scientific models of human visual perception. Consistent with this, we show that when small-norm image perturbations are generated by standard ANN models, human object category percepts are indeed highly stable. However, in this very same "human-presumed-stable" regime, we find that robustified ANNs reliably discover low-norm image perturbations that strongly disrupt human percepts. These previously undetectable human perceptual disruptions are massive in amplitude, approaching the same level of sensitivity seen in robustified ANNs. Further, we show that robustified ANNs support precise perceptual state interventions: they guide the construction of low-norm image perturbations that strongly alter human category percepts toward specific prescribed percepts. These observations suggest that for arbitrary starting points in image space, there exists a set of nearby "wormholes", each leading the subject from their current category perceptual state into a semantically very different state. Moreover, contemporary ANN models of biological visual processing are now accurate enough to consistently guide us to those portals.

}, doi = {10.48550/arXiv.2308.06887}, url = {https://arxiv.org/pdf/2308.06887.pdf}, author = {Gaziv, Guy and Lee, Michael J and DiCarlo, James J} } @conference {179, title = {Strong and Precise Modulation of Human Percepts via Robustified ANNs}, booktitle = {Neural Information Processing Systems}, year = {2023}, address = {New Orleans, Louisiana}, abstract = {

The visual object category reports of artificial neural networks (ANNs) are notoriously sensitive to tiny, adversarial image perturbations. Because human category reports (aka human percepts) are thought to be insensitive to those same small-norm perturbations -- and locally stable in general -- this argues that ANNs are incomplete scientific models of human visual perception. Consistent with this, we show that when small-norm image perturbations are generated by standard ANN models, human object category percepts are indeed highly stable. However, in this very same "human-presumed-stable" regime, we find that robustified ANNs reliably discover low-norm image perturbations that strongly disrupt human percepts. These previously undetectable human perceptual disruptions are massive in amplitude, approaching the same level of sensitivity seen in robustified ANNs. Further, we show that robustified ANNs support precise perceptual state interventions: they guide the construction of low-norm image perturbations that strongly alter human category percepts toward specific prescribed percepts. In sum, these contemporary models of biological visual processing are now accurate enough to guide strong and precise interventions on human perception.

}, url = {https://openreview.net/pdf?id=5GmTI4LNqX}, author = {Gaziv, Guy and Lee, Michael J and DiCarlo, James J} } @article {177, title = {A Unifying Principle for the Functional Organization of Visual Cortex}, journal = {bioRxiv}, year = {2023}, month = {2023}, abstract = {

A key feature of many cortical systems is functional organization: the arrangement of neurons with specific functional properties in characteristic spatial patterns across the cortical surface. However, the principles underlying the emergence and utility of functional organization are poorly understood. Here we develop the Topographic Deep Artificial Neural Network (TDANN), the first unified model to accurately predict the functional organization of multiple cortical areas in the primate visual system. We analyze the key factors responsible for the TDANN's success and find that it strikes a balance between two specific objectives: achieving a task-general sensory representation that is self-supervised, and maximizing the smoothness of responses across the cortical sheet according to a metric that scales relative to cortical surface area. In turn, the representations learned by the TDANN are lower dimensional and more brain-like than those in models that lack a spatial smoothness constraint. Finally, we provide evidence that the TDANN's functional organization balances performance with inter-area connection length, and use the resulting models for a proof-of-principle optimization of cortical prosthetic design. Our results thus offer a unified principle for understanding functional organization and a novel view of the functional role of the visual system in particular.

}, doi = {10.1101/2023.05.18.541361}, url = {https://www.biorxiv.org/content/10.1101/2023.05.18.541361v1.full.pdf}, author = {Margalit, Eshed and Lee, Hyodong and Finzi, Dawn and DiCarlo, James J and Grill-Spector, Kalanit and Yamins, Daniel LK} } @article {170, title = {Adversarially trained neural representations may already be as robust as corresponding biological neural representations}, journal = {arXiv}, year = {2022}, month = {06/19/2022}, type = {preprint}, abstract = {

Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to that of existing (robustly trained) artificial neural networks.

}, doi = {10.48550/arXiv.2206.11228}, url = {https://arxiv.org/abs/2206.11228v1}, author = {Guo, Chong and Lee, Michael J and Leclerc, Guillaume and Dapello, Joel and Rao, Yug and Madry, Aleksander and DiCarlo, James J} } @article {171, title = {Aligning Model and Macaque Inferior Temporal Cortex Representations Improves Model-to-Human Behavioral Alignment and Adversarial Robustness}, journal = {bioRxiv}, year = {2022}, month = {July 4, 2022}, type = {preprint}, abstract = {

While some state-of-the-art artificial neural network systems in computer vision are strikingly accurate models of the corresponding primate visual processing, there are still many discrepancies between these models and the behavior of primates on object recognition tasks. Many current models suffer from extreme sensitivity to adversarial attacks and often do not align well with the image-by-image behavioral error patterns observed in humans. Previous research has provided strong evidence that primate object recognition behavior can be very accurately predicted by neural population activity in the inferior temporal (IT) cortex, a brain area in the late stages of the visual processing hierarchy. Therefore, here we directly test whether making the late-stage representations of models more similar to that of macaque IT produces new models that exhibit more robust, primate-like behavior. We conducted chronic, large-scale multi-electrode recordings across the IT cortex in six non-human primates (rhesus macaques). We then use these data to fine-tune (end-to-end) the model "IT" representations such that they are more aligned with the biological IT representations, while preserving accuracy on object recognition tasks. We generate a cohort of models with a range of IT similarity scores validated on held-out animals across two image sets with distinct statistics. Across a battery of optimization conditions, we observed a strong correlation between the models' IT-likeness and alignment with human behavior, as well as an increase in their adversarial robustness. We further assessed the limitations of this approach and find that the improvements in behavioral alignment and adversarial robustness generalize across different image statistics, but not to object categories outside of those covered in our IT training set. Taken together, our results demonstrate that building models that are more aligned with the primate brain leads to more robust and human-like behavior, and call for larger neural datasets to further augment these gains.

}, doi = {10.1101/2022.07.01.498495}, url = {https://www.biorxiv.org/content/10.1101/2022.07.01.498495v1.full.pdf}, author = {Dapello, Joel and Kar, Kohitij and Schrimpf, Martin and Geary, Robert and Ferguson, Michael and Cox, David D. and DiCarlo, James J.} } @conference {173, title = {Primate Inferotemporal Cortex Neurons Generalize Better to Novel Image Distributions Than Analogous Deep Neural Networks Units}, booktitle = {SVHRM Workshop at Neural Information Processing Systems (NeurIPS)}, year = {2022}, month = {2022}, address = {New Orleans, Louisiana}, abstract = {

Humans are successfully able to recognize objects in a variety of image distributions. Today's artificial neural networks (ANNs), on the other hand, struggle to recognize objects in many image domains, especially those different from the training distribution. It is currently unclear which parts of the ANNs could be improved in order to close this generalization gap. In this work, we used recordings from primate high-level visual cortex (IT) to isolate whether ANNs lag behind primate generalization capabilities because of their encoder (transformations up to the penultimate layer), or their decoder (linear transformation into class labels). Specifically, we fit a linear decoder on images from one domain and evaluate transfer performance on twelve held-out domains, comparing fitting on primate IT representations vs. representations in ANN penultimate layers. To fairly compare, we scale the number of each ANN's units so that its in-domain performance matches that of the sampled IT population (i.e. 71 IT neural sites, 73\% binary-choice accuracy). We find that the sampled primate population achieves, on average, 68\% performance on the held-out domains. Comparably sampled populations from ANN model units generalize less well, maintaining on average 60\%. This is independent of the number of sampled units: models' out-of-domain accuracies consistently lag behind primate IT. These results suggest that making ANN model units more like primate IT will improve the generalization performance of ANNs.

}, url = {https://openreview.net/pdf?id=iPF7mhoWkOl}, author = {Bagus, Ayu Marliawaty I Gusti and Marques, Tiago and Sanghavi, Sachi and DiCarlo, James J and Schrimpf, Martin} } @conference {146, title = {Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream}, booktitle = {International Conference on Learning Representations 2022 Spotlight}, year = {2022}, month = {April 25, 2022}, type = {preprint}, abstract = {

After training on large datasets, certain deep neural networks are surprisingly good models of the neural mechanisms of adult primate visual object recognition. Nevertheless, these models are considered poor models of the development of the visual system because they posit millions of sequential, precisely coordinated synaptic updates, each based on a labeled image. While ongoing research is pursuing the use of unsupervised proxies for labels, we here explore a complementary strategy of reducing the required number of supervised synaptic updates to produce an adult-like ventral visual stream (as judged by the match to V1, V2, V4, IT, and behavior). Such models might require less precise machinery and energy expenditure to coordinate these updates and would thus move us closer to viable neuroscientific hypotheses about how the visual system wires itself up. Relative to standard model training on labeled images in ImageNet, we here demonstrate that the total number of supervised weight updates can be substantially reduced using three complementary strategies: First, we find that only 2\% of supervised updates (epochs and images) are needed to achieve $\sim$80\% of a fully trained model's match to adult ventral stream. Specifically, training benefits predictions of higher visual cortex the most, whereas predictions of earlier areas improve only marginally over the course of training. Second, by improving the random distribution of synaptic connectivity, we find that 54\% of the brain match can already be achieved "at birth" (i.e. no training at all). Third, we find that, by training only $\sim$5\% of model synapses, we can still achieve nearly 80\% of the match to the ventral stream. This approach further improves on ImageNet performance over previous attempts in computer vision of minimizing trained components without substantially increasing the number of trained parameters. These results reflect first steps in modeling not just primate adult visual processing during inference, but also how the ventral visual stream might be "wired up" by evolution (a model's "birth" state) and by developmental learning (a model's updates based on visual experience).

}, keywords = {biologically plausible learning, computational neuroscience, convolutional neural networks, primate visual ventral stream}, doi = {10.1101/2020.06.08.140111}, url = {https://openreview.net/pdf?id=g1SzIRLQXMM}, author = {Geiger, Franziska and Schrimpf, Martin and Marques, Tiago and DiCarlo, James J} } @article {165, title = {Chemogenetic suppression of macaque V4 neurons produces retinotopically specific deficits in downstream IT neural activity patterns and core object recognition behavior}, journal = {Journal of Vision}, volume = {21}, year = {2021}, pages = {2489-2489}, abstract = {

Distributed activity patterns across multiple brain areas (e.g., V4, IT) enable primates to accurately identify visual objects. To strengthen our inferences about the causal role of underlying brain circuits, it is necessary to develop targeted neural perturbation strategies that enable discrimination amongst competing models. To probe the role of area V4 in core object recognition, we expressed inhibitory DREADDs in neurons within a 5x5 mm subregion of V4 cortex via multiple viral injections (AAV8-hSyn-hM4Di-mCherry; two macaques). To assay for successful neural suppression, we recorded from a multi-electrode array implanted over the transfected V4. We also recorded from multi-electrode arrays in the IT cortex (the primary feedforward target of V4), while simultaneously measuring the monkeys' behavior during object discrimination tasks. We found that systemic (intramuscular) injection of the DREADDs activator (CNO) produced reversible reductions ($\sim$20\%) in image-evoked V4 responses compared to the control condition (saline injections). Monkeys showed significant behavioral performance deficits upon CNO injections (compared to saline), which were larger when the object position overlapped with the RF estimates of the transfected V4 neurons. This is consistent with the hypothesis that the suppressed V4 neurons are critical to this behavior. Furthermore, we observed commensurate deficits in the linearly-decoded estimates of object identity from the IT population activity (post-CNO). To model the perturbed brain circuitry, we used a primate brain-mapped artificial neural network (ANN) model (CORnet-S) that supports object recognition. We "lesioned" the model's corresponding V4 subregion by modifying its weights such that the responses matched a subset of our experimental V4 measurements (post-CNO). Indeed, the lesioned model better predicted the measured (held-out) V4 and IT responses (post-CNO), compared to the model's non-lesioned version, validating our approach. In the future, our approach will allow us to discriminate amongst competing mechanistic brain models, while the data provide constraints to guide more accurate alternatives.

}, doi = {10.1167/jov.21.9.2489}, author = {Kar, Kohitij and Schrimpf, Martin and Schmidt, Kailyn and DiCarlo, James J.} } @article {162, title = {Chronically implantable LED arrays for behavioral optogenetics in primates}, journal = {Nature Methods}, volume = {18}, year = {2021}, month = {Jan-09-2021}, pages = {1112-1116}, abstract = {

Optogenetic methods have been widely used in rodent brains, but remain relatively under-developed for nonhuman primates such as rhesus macaques, an animal model with a large brain expressing sophisticated sensory, motor and cognitive behaviors. To address challenges in behavioral optogenetics in large brains, we developed Opto-Array, a chronically implantable array of light-emitting diodes for high-throughput optogenetic perturbation. We demonstrated that optogenetic silencing in the macaque primary visual cortex with the help of the Opto-Array results in reliable retinotopic visual deficits in a luminance discrimination task. We separately confirmed that Opto-Array illumination results in local neural silencing, and that behavioral effects are not due to tissue heating. These results demonstrate the effectiveness of the Opto-Array for behavioral optogenetic applications in large brains.

}, issn = {1548-7091}, doi = {10.1038/s41592-021-01238-9}, url = {https://www.nature.com/articles/s41592-021-01238-9}, author = {Rajalingham, Rishi and Sorenson, Michael and Azadi, Reza and Bohn, Simon and DiCarlo, James J. and Afraz, Arash} } @proceedings {163, title = {Combining Different V1 Brain Model Variants to Improve Robustness to Image Corruptions in CNNs}, journal = {Shared Visual Representations in Human \& Machine Intelligence - NeurIPS Workshop}, year = {2021}, month = {October 20, 2021}, publisher = {Neural Information Processing Systems}, abstract = {

While some convolutional neural networks (CNNs) have surpassed human visual abilities in object classification, they often struggle to recognize objects in images corrupted with different types of common noise patterns, highlighting a major limitation of this family of models. Recently, it has been shown that simulating a primary visual cortex (V1) at the front of CNNs leads to small improvements in robustness to these image perturbations. In this study, we start with the observation that different variants of the V1 model show gains for specific corruption types. We then build a new model using an ensembling technique, which combines multiple individual models with different V1 front-end variants. The model ensemble leverages the strengths of each individual model, leading to significant improvements in robustness across all corruption categories and outperforming the base model by 38\% on average. Finally, we show that using distillation, it is possible to partially compress the knowledge in the ensemble model into a single model with a V1 front-end. While the ensembling and distillation techniques used here are hardly biologically-plausible, the results presented here demonstrate that by combining the specific strengths of different neuronal circuits in V1 it is possible to improve the robustness of CNNs for a wide range of perturbations.

}, url = {https://arxiv.org/abs/2110.10645}, author = {Baidya, Avinash and Dapello, Joel and DiCarlo, James J and Marques, Tiago} } @article {161, title = {Computational models of category-selective brain regions enable high-throughput tests of selectivity}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {Jan-12-2021}, abstract = {

Cortical regions apparently selective to faces, places, and bodies have provided important evidence for domain-specific theories of human cognition, development, and evolution. But claims of category selectivity are not quantitatively precise and remain vulnerable to empirical refutation. Here we develop artificial neural network-based encoding models that accurately predict the response to novel images in the fusiform face area, parahippocampal place area, and extrastriate body area, outperforming descriptive models and experts. We use these models to subject claims of category selectivity to strong tests, by screening for and synthesizing images predicted to produce high responses. We find that these high-response-predicted images are all unambiguous members of the hypothesized preferred category for each region. These results provide accurate, image-computable encoding models of each category-selective region, strengthen evidence for domain specificity in the brain, and point the way for future research characterizing the functional organization of the brain with unprecedented computational precision.

}, doi = {10.1038/s41467-021-25409-6}, url = {https://www.nature.com/articles/s41467-021-25409-6}, author = {Murty, NAR and Bashivan, Pouya and Abate, Alex and DiCarlo, James J. and Kanwisher, Nancy} } @article {153, title = {Fast Recurrent Processing Via Ventral Prefrontal Cortex is Needed by the Primate Ventral Stream for Robust Core Visual Object Recognition}, journal = {Neuron}, volume = {109}, year = {2021}, pages = {164-176.e5}, chapter = {164}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core visual object recognition require additional time to develop for specific ("late-solved") images, suggesting the necessity of recurrent processing in these computations. Which brain circuit motifs are most responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventral prefrontal cortex (vPFC) is a critical recurrent circuit node in this system, here we pharmacologically inactivated parts of the vPFC and simultaneously measured IT population activity, while monkeys performed object discrimination tasks. Our results show that vPFC inactivation deteriorated the quality of the late-phase (>150 ms from image onset) IT population code, along with commensurate, specific behavioral deficits for "late-solved" images. Finally, silencing vPFC caused the monkeys' IT activity patterns and behavior to become more like those produced by feedforward artificial neural network models of the ventral stream. Together with prior work, these results argue that fast recurrent processing through the vPFC is critical to the production of behaviorally-sufficient object representations in IT.

}, doi = {10.1016/j.neuron.2020.09.035}, url = {https://www.cell.com/neuron/pdf/S0896-6273(20)30759-5.pdf}, author = {Kar, Kohitij and DiCarlo, James J.} } @article {157, title = {Multi-scale hierarchical neural network models that bridge from single neurons in the primate primary visual cortex to object recognition behavior}, journal = {bioRxiv}, year = {2021}, month = {03/01/2021}, type = {preprint}, abstract = {Object recognition relies on inferior temporal (IT) cortical neural population representations that are themselves computed by a hierarchical network of feedforward and recurrently connected neural populations called the ventral visual stream (areas V1, V2, V4 and IT). While recent work has created some reasonably accurate image-computable hierarchical neural network models of those neural stages, those models do not yet bridge between the properties of individual neurons and the overall emergent behavior of the ventral stream. For example, current leading ventral stream models do not allow us to ask questions such as: How does the surround suppression behavior of individual V1 neurons ultimately relate to IT neural representation and to behavior?; or How would deactivation of a particular sub-population of V1 neurons specifically alter object recognition behavior? One reason we cannot yet do this is that individual V1 artificial neurons in multi-stage models have not been shown to be functionally similar to individual biological V1 neurons. Here, we took an important first step in this direction by building and evaluating hundreds of hierarchical neural network models in how well their artificial single neurons approximate macaque primary visual cortical (V1) neurons. We found that single neurons in some models are surprisingly similar to their biological counterparts and that the distributions of single neuron properties, such as those related to orientation and spatial frequency tuning, approximately match those in macaque V1. Crucially, we also observed that hierarchical models with V1 layers that better match macaque V1 at the single neuron level are also more aligned with human object recognition behavior. These results provide the first multi-stage, multi-scale models that allow our field to ask precisely how the specific properties of individual V1 neurons relate to recognition behavior. Finally, we here show that an optimized classical neuroscientific model of V1 is still more functionally similar to primate V1 than all of the tested multi-stage models, suggesting that further model improvements are possible, and that those improvements would likely have tangible payoffs in terms of behavioral prediction accuracy and behavioral robustness.}, doi = {10.1101/2021.03.01.433495}, author = {Tiago Marques and Martin Schrimpf and James J DiCarlo} } @conference {169, title = {Neural Population Geometry Reveals the Role of Stochasticity in Robust Perception}, booktitle = {Neural Information Processing Systems (NeurIPS)}, year = {2021}, address = {virtual}, abstract = {

Adversarial examples are often cited by neuroscientists and machine learning researchers as an example of how computational models diverge from biological sensory systems. Recent work has proposed adding biologically-inspired components to visual neural networks as a way to improve their adversarial robustness. One surprisingly effective component for reducing adversarial vulnerability is response stochasticity, like that exhibited by biological neurons. Here, using recently developed geometrical techniques from computational neuroscience, we investigate how adversarial perturbations influence the internal representations of standard, adversarially trained, and biologically-inspired stochastic networks. We find distinct geometric signatures for each type of network, revealing different mechanisms for achieving robust representations. Next, we generalize these results to the auditory domain, showing that neural stochasticity also makes auditory models more robust to adversarial perturbations. Geometric analysis of the stochastic networks reveals overlap between representations of clean and adversarially perturbed stimuli, and quantitatively demonstrates that competing geometric effects of stochasticity mediate a tradeoff between adversarial and clean performance. Our results shed light on the strategies of robust perception utilized by adversarially trained and stochastic networks, and help explain how stochasticity may be beneficial to machine and biological computation.

}, url = {https://proceedings.neurips.cc/paper/2021/file/8383f931b0cefcc631f070480ef340e1-Paper.pdf}, author = {Joel Dapello and Jenelle Feather and Tiago Marques and David Cox and Josh McDermott and James J DiCarlo and SueYeon Chung} } @article {158, title = {The ThreeDWorld Transport Challenge: A Visually Guided Task-and-Motion Planning Benchmark for Physically Realistic Embodied AI}, journal = {arXiv}, year = {2021}, month = {03/25/2021}, type = {preprint}, abstract = {

We introduce a visually-guided and physics-driven task-and-motion planning benchmark, which we call the ThreeDWorld Transport Challenge. In this challenge, an embodied agent equipped with two 9-DOF articulated arms is spawned randomly in a simulated physical home environment. The agent is required to find a small set of objects scattered around the house, pick them up, and transport them to a desired final location. We also position containers around the house that can be used as tools to assist with transporting objects efficiently. To complete the task, an embodied agent must plan a sequence of actions to change the state of a large number of objects in the face of realistic physical constraints. We build this benchmark challenge using the ThreeDWorld simulation: a virtual 3D environment where all objects respond to physics, and where agents can be controlled using a fully physics-driven navigation and interaction API. We evaluate several existing agents on this benchmark. Experimental results suggest that: 1) a pure RL model struggles on this challenge; 2) hierarchical planning-based agents can transport some objects but are still far from solving this task. We anticipate that this benchmark will empower researchers to develop more intelligent physics-driven robots for the physical world.

}, doi = {10.48550/arXiv.2103.14025}, url = {https://arxiv.org/abs/2103.14025}, author = {Gan, Chuang and Zhou, Siyuan and Schwartz, Jeremy and Alter, Seth and Bhandwaldar, Abhishek and Gutfreund, Dan and Yamins, Daniel LK and DiCarlo, James J and McDermott, Josh and Torralba, Antonio} } @proceedings {164, title = {Topographic ANNs Predict the Behavioral Effects of Causal Perturbations in Primate Visual Ventral Stream IT}, journal = {Champalimaud Research Symposium (CRS21)}, year = {2021}, address = {Lisbon, Portugal}, author = {Schrimpf, Martin and McGrath, Paul and DiCarlo, James J.} } @article {160, title = {Unsupervised changes in core object recognition behavior are predicted by neural plasticity in inferior temporal cortex}, journal = {eLife}, volume = {10}, year = {2021}, month = {Nov-06-2021}, abstract = {

Temporal continuity of object identity is a feature of natural visual input, and is potentially exploited -- in an unsupervised manner -- by the ventral visual stream to build the neural representation in inferior temporal (IT) cortex. Here we investigated whether plasticity of individual IT neurons underlies human core-object-recognition behavioral changes induced with unsupervised visual experience. We built a single-neuron plasticity model combined with a previously established IT population-to-recognition-behavior linking model to predict human learning effects. We found that our model, once constrained by neurophysiological data, largely predicted the mean direction, magnitude and time course of human performance changes. We also found a previously unreported dependency of the observed human performance change on the initial task difficulty. This result adds support to the hypothesis that tolerant core object recognition in human and non-human primates is instructed -- at least in part -- by naturally occurring unsupervised temporal contiguity experience.

}, doi = {10.7554/eLife.60830}, url = {https://elifesciences.org/articles/60830}, author = {Jia, Xiaoxuan and Hong, Ha and DiCarlo, James J} } @article {167, title = {Unsupervised neural network models of the ventral visual stream}, journal = {Proceedings of the National Academy of Sciences}, volume = {118}, year = {2021}, month = {Jan-2021}, pages = {e2014196118}, abstract = {

Deep neural networks currently provide the best quantitative models of the response patterns of neurons throughout the primate ventral visual stream. However, such networks have remained implausible as a model of the development of the ventral stream, in part because they are trained with supervised methods requiring many more labels than are accessible to infants during development. Here, we report that recent rapid progress in unsupervised learning has largely closed this gap. We find that neural network models learned with deep unsupervised contrastive embedding methods achieve neural prediction accuracy in multiple ventral visual cortical areas that equals or exceeds that of models derived using today's best supervised methods and that the mapping of these neural network models' hidden layers is neuroanatomically consistent across the ventral stream. Strikingly, we find that these methods produce brain-like representations even when trained solely with real human child developmental data collected from head-mounted cameras, despite the fact that these datasets are noisy and limited. We also find that semisupervised deep contrastive embeddings can leverage small numbers of labeled examples to produce representations with substantially improved error-pattern consistency to human behavior. Taken together, these results illustrate a use of unsupervised learning to provide a quantitative model of a multiarea cortical brain system and present a strong candidate for a biologically plausible computational theory of primate sensory learning.

}, issn = {0027-8424}, doi = {10.1073/pnas.2014196118}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2014196118}, author = {Zhuang, Chengxu and Yan, Siming and Nayebi, Aran and Schrimpf, Martin and Frank, Michael C. and DiCarlo, James J. and Yamins, Daniel L. K.} } @article {149, title = {Chronically implantable LED arrays for behavioral optogenetics in primates}, journal = {bioRxiv}, year = {2020}, month = {9/11/2020}, type = {preprint}, abstract = {

Challenges in behavioral optogenetics in large brains demand development of a chronically implantable platform for light delivery. We have developed Opto-Array, a chronically implantable array of LEDs for high-throughput optogenetic perturbation in non-human primates. We tested the Opto-Array in the primary visual cortex of a macaque monkey, and demonstrated that optogenetic cortical silencing by the Opto-Array results in reliable retinotopic visual deficits on a luminance discrimination task.

}, doi = {10.1101/2020.09.10.291583}, url = {https://www.biorxiv.org/content/10.1101/2020.09.10.291583v1.abstract}, author = {Rajalingham, Rishi and Sorenson, Michael and Azadi, Reza and Bohn, Simon and DiCarlo, James J. and Afraz, Arash} } @proceedings {156, title = {Correlation-based spatial layout of deep neural network features generates ventral stream topography}, journal = {Computation and Systems Neuroscience (COSYNE)}, year = {2020}, month = {2/28/2020}, publisher = {COSYNE}, address = {Denver, CO}, abstract = {

The primate visual system is organized into functional maps, including pinwheel-like arrangements of orientation-tuned neurons in primary visual cortex (V1) and patches of category-selective neurons in higher visual cortex. Recent work has demonstrated that deep convolutional neural networks (DCNNs) trained for object recognition are good descriptors of neural representations throughout the ventral pathway, with early, intermediate, and late cortical brain areas best predicted by corresponding layers of the DCNN. Despite this success, DCNNs have no inherent spatial layout for features at a given retinotopic location, and thus make no predictions regarding many of the characteristic topographic phenomena observed in the brain beyond retinotopy itself, e.g., pinwheels and patches. Cortical map formation has been modeled using self-organizing maps that leverage principles of wiring-length minimization and local correlations of unit responses to produce topographic structure. However, these methods rely on simplified feature parameterizations that limit their ability to accommodate more realistic descriptions of neuron response properties, especially in higher visual areas. Here, we augment DCNNs by assigning model units spatial positions in a 2D "cortical sheet" and introduce a novel algorithm to arrange units so that local response correlations are maximized. Applying this algorithm to a categorization-optimized DCNN, we find that layouts generated from earlier layers recapitulate core features of V1 orientation, spatial frequency, and color preference maps, while those generated from later layers naturally exhibit category-selective clusters. Because this wide range of apparently disparate phenomenology is produced by the same underlying principle, our results suggest that the functional architecture of the visual system can be explained by two fundamental constraints: the need to perform visual tasks and the pressure to minimize biophysical costs such as wiring length. Our framework for spatially mapping DCNNs integrates biophysical and representational phenomenology, allowing a more unified understanding of the visual system's functional architecture.

}, url = {http://cosyne.org/cosyne20/Cosyne2020_program_book.pdf}, author = {Eshed Margalit and Hyodong Lee and Tiago Marques and James J. DiCarlo and Daniel L.K. Yamins} } @article {6, title = {Fast recurrent processing via ventral prefrontal cortex is needed by the primate ventral stream for robust core visual object recognition}, journal = {BioRxiv}, year = {2020}, month = {05/2020}, type = {preprint}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core visual object recognition require additional time to develop for specific ("late-solved") images, suggesting the necessity of recurrent processing in these computations. Which brain circuit motifs are most responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventral prefrontal cortex (vPFC) is a critical recurrent circuit node in this system, here we pharmacologically inactivated parts of the vPFC and simultaneously measured IT population activity while monkeys performed object discrimination tasks. Our results show that vPFC inactivation deteriorated the quality of the late-phase (>150 ms from image onset) IT population code, along with commensurate, specific behavioral deficits for "late-solved" images. Finally, silencing vPFC caused the monkeys' IT activity patterns and behavior to become more like those produced by feedforward artificial neural network models of the ventral stream. Together with prior work, these results argue that fast recurrent processing through the vPFC is critical to the production of behaviorally sufficient object representations in IT.

}, doi = {https://doi.org/10.1101/2020.05.10.086959}, url = {https://www.biorxiv.org/content/10.1101/2020.05.10.086959v1}, author = {Kohitij Kar and James J. DiCarlo} } @article {141, title = {The inferior temporal cortex is a potential cortical precursor of orthographic processing in untrained monkeys}, journal = {Nature Communications}, volume = {11}, year = {2020}, month = {Jan-12-2020}, abstract = {

The ability to recognize written letter strings is foundational to human reading, but the underlying neuronal mechanisms remain largely unknown. Recent behavioral research in baboons suggests that non-human primates may provide an opportunity to investigate this question. We recorded the activity of hundreds of neurons in V4 and the inferior temporal cortex (IT) while naïve macaque monkeys passively viewed images of letters, English words and non-word strings, and tested the capacity of those neuronal representations to support a battery of orthographic processing tasks. We found that simple linear read-outs of IT (but not V4) population responses achieved high performance on all tested tasks, even matching the performance and error patterns of baboons on word classification. These results show that the IT cortex of untrained primates can serve as a precursor of orthographic processing, suggesting that the acquisition of reading in humans relies on the recycling of a brain network evolved for other visual functions.
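
Illustrative sketch (synthetic data, not the paper's recordings): the decoding logic described above amounts to fitting a cross-validated linear classifier to population response vectors, one orthographic task at a time. Dataset sizes and signal structure below are invented for illustration.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_sites, n_stim = 300, 400                        # recording sites x letter strings
labels = rng.integers(0, 2, n_stim)               # 1 = word, 0 = non-word string
signal = rng.standard_normal(n_sites)             # a population axis carrying the signal
X = rng.standard_normal((n_stim, n_sites)) + 0.3 * labels[:, None] * signal
acc = cross_val_score(LogisticRegression(max_iter=1000), X, labels, cv=5)
print(f"cross-validated word/non-word accuracy: {acc.mean():.2f}")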

}, doi = {10.1038/s41467-020-17714-3}, url = {http://www.nature.com/articles/s41467-020-17714-3}, author = {Rajalingham, Rishi and Kar, Kohitij and Sanghavi, Sachi and Dehaene, Stanislas and DiCarlo, James J.} } @article {147, title = {Integrative Benchmarking to Advance Neurally Mechanistic Models of Human Intelligence}, journal = {Neuron}, year = {2020}, month = {Jan-09-2020}, abstract = {

A potentially organizing goal of the brain and cognitive sciences is to accurately explain domains of human intelligence as executable, neurally mechanistic models. Years of research have led to models that capture experimental results in individual behavioral tasks and individual brain regions. We here advocate for taking the next step: integrating experimental results from many laboratories into suites of benchmarks that, when considered together, push mechanistic models toward explaining entire domains of intelligence, such as vision, language, and motor control. Given recent successes of neurally mechanistic models and the surging availability of neural, anatomical, and behavioral data, we believe that now is the time to create integrative benchmarking platforms that incentivize ambitious, unified models. This perspective discusses the advantages and the challenges of this approach and proposes specific steps to achieve this goal in the domain of visual intelligence with the case study of an integrative benchmarking platform called Brain-Score.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.07.040}, url = {https://linkinghub.elsevier.com/retrieve/pii/S089662732030605X}, author = {Schrimpf, Martin and Kubilius, Jonas and Lee, Michael J. and Murty, NAR and Ajemian, Robert and DiCarlo, James J.} } @article {152, title = {An Open Resource for Non-human Primate Optogenetics}, journal = {Neuron}, year = {2020}, month = {January 10, 2020}, type = {NeuroResource}, abstract = {

Optogenetics has revolutionized neuroscience in small laboratory animals, but its effect on animal models more closely related to humans, such as non-human primates (NHPs), has been mixed. To make evidence-based decisions in primate optogenetics, the scientific community would benefit from a centralized database listing all attempts, successful and unsuccessful, of using optogenetics in the primate brain. We contacted members of the community to ask for their contributions to an open science initiative. As of this writing, 45 laboratories around the world contributed more than 1,000 injection experiments, including precise details regarding their methods and outcomes. Of those entries, more than half had not been published. The resource is free for everyone to consult and contribute to on the Open Science Framework website. Here we review some of the insights from this initial release of the database and discuss methodological considerations to improve the success of optogenetic experiments in NHPs.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.09.027}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627320307510}, author = {Tremblay, Sebastien and Acker, Leah and Afraz, Arash and Albaugh, Daniel L. and Amita, Hidetoshi and Andrei, Ariana R. and Angelucci, Alessandra and Aschner, Amir and Balan, Puiu F. and Basso, Michele A. and Benvenuti, Giacomo and Bohlen, Martin O. and Caiola, Michael J. and Calcedo, Roberto and Cavanaugh, James and Chen, Yuzhi and Chen, Spencer and Chernov, Mykyta M. and Clark, Andrew M. and Dai, Ji and Debes, Samantha R. and Deisseroth, Karl and Desimone, Robert and Dragoi, Valentin and Egger, Seth W. and Eldridge, Mark A.G. and El-Nahal, Hala G. and Fabbrini, Francesco and Federer, Frederick and Fetsch, Christopher R. and Fortuna, Michal G. and Friedman, Robert M. and Fujii, Naotaka and Gail, Alexander and Galvan, Adriana and Ghosh, Supriya and Gieselmann, Marc Alwin and Gulli, Roberto A. and Hikosaka, Okihide and Hosseini, Eghbal A. and Hu, Xing and H{\"u}er, Janina and Inoue, Ken-ichi and Janz, Roger and Jazayeri, Mehrdad and Jiang, Rundong and Ju, Niansheng and Kar, Kohitij and Klein, Carsten and Kohn, Adam and Komatsu, Misako and Maeda, Kazutaka and Martinez-Trujillo, Julio C. and Matsumoto, Masayuki and Maunsell, John H.R. and Mendoza-Halliday, Diego and Monosov, Ilya E. and Muers, Ross S. and Nurminen, Lauri and Ortiz-Rios, Michael and O{\textquoteright}Shea, Daniel J. and Palfi, St{\'e}phane and Petkov, Christopher I. and Pojoga, Sorin and Rajalingham, Rishi and Ramakrishnan, Charu and Remington, Evan D. and Revsine, Cambria and Roe, Anna W. and Sabes, Philip N. and Saunders, Richard C. and Scherberger, Hansj{\"o}rg and Schmid, Michael C. and Schultz, Wolfram and Seidemann, Eyal and Senova, Yann-Suhan and Shadlen, Michael N. and Sheinberg, David L. and Siu, Caitlin and Smith, Yoland and Solomon, Selina S. and Sommer, Marc A. and Spudich, John L. and Stauffer, William R. and Takada, Masahiko and Tang, Shiming and Thiele, Alexander and Treue, Stefan and Vanduffel, Wim and Vogels, Rufin and Whitmire, Matthew P. and Wichmann, Thomas and Wurtz, Robert H. and Xu, Haoran and Yazdan-Shahmorad, Azadeh and Shenoy, Krishna V. and DiCarlo, James J. and Platt, Michael L.} } @article {151, title = {Simulating a Primary Visual Cortex at the Front of CNNs Improves Robustness to Image Perturbations}, journal = {Neural Information Processing Systems (NeurIPS; spotlight)}, year = {2020}, month = {June 17, 2020}, type = {preprint}, abstract = {

Current state-of-the-art object recognition models are largely based on convolutional neural network (CNN) architectures, which are loosely inspired by the primate visual system. However, these CNNs can be fooled by imperceptibly small, explicitly crafted perturbations, and struggle to recognize objects in corrupted images that are easily recognized by humans. Here, by making comparisons with primate neural data, we first observed that CNN models with a neural hidden layer that better matches primate primary visual cortex (V1) are also more robust to adversarial attacks. Inspired by this observation, we developed VOneNets, a new class of hybrid CNN vision models. Each VOneNet contains a fixed-weight neural network front-end that simulates primate V1, called the VOneBlock, followed by a neural network back-end adapted from current CNN vision models. The VOneBlock is based on a classical neuroscientific model of V1: the linear-nonlinear-Poisson model, consisting of a biologically constrained Gabor filter bank, simple and complex cell nonlinearities, and a V1 neuronal stochasticity generator. After training, VOneNets retain high ImageNet performance, but each is substantially more robust, outperforming the base CNNs and state-of-the-art methods by 18% and 3%, respectively, on a conglomerate benchmark of perturbations comprising white box adversarial attacks and common image corruptions. Finally, we show that all components of the VOneBlock work in synergy to improve robustness. While current CNN architectures are arguably brain-inspired, the results presented here demonstrate that more precisely mimicking just one stage of the primate visual system leads to new gains in ImageNet-level computer vision applications.
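
Minimal sketch of the V1-like front-end idea described above (fixed Gabor filter bank, simple/complex-cell nonlinearities, Poisson-like stochasticity), written in PyTorch; filter parameters and the noise model are illustrative choices, not the published VOneBlock settings.

import math
import torch
import torch.nn.functional as F

def gabor(theta, size=15, sigma=3.0, lam=8.0):    # one oriented Gabor filter
    xs = torch.arange(size, dtype=torch.float32) - size // 2
    y, x = torch.meshgrid(xs, xs, indexing="ij")
    xr = x * math.cos(theta) + y * math.sin(theta)
    return torch.exp(-(x**2 + y**2) / (2 * sigma**2)) * torch.cos(2 * math.pi * xr / lam)

bank = torch.stack([gabor(i * math.pi / 8) for i in range(8)]).unsqueeze(1)

def vone_like_block(img):                         # img: (batch, 1, H, W) grayscale
    resp = F.conv2d(img, bank, padding=7)
    simple = F.relu(resp)                         # simple cells: rectification
    complex_ = torch.sqrt(F.avg_pool2d(resp**2, 3, stride=1, padding=1) + 1e-6)
    out = torch.cat([simple, complex_], dim=1)
    noise = out.clamp(min=0).sqrt() * torch.randn_like(out)   # Poisson-like variance
    return out + noise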

}, doi = {10.1101/2020.06.16.154542}, url = {https://www.biorxiv.org/content/10.1101/2020.06.16.154542v27}, author = {Dapello, Joel and Marques, Tiago and Schrimpf, Martin and Geiger, Franziska and Cox, David D and DiCarlo, James J} } @article {148, title = {ThreeDWorld: A Platform for Interactive Multi-Modal Physical Simulation}, journal = {arXiv}, year = {2020}, month = {July 9, 2020}, type = {preprint}, abstract = {

We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) real-time near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments; 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.
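
Hedged usage sketch of the controller pattern that TDW's API follows (commands sent as JSON-style dictionaries); the specific command names below are assumptions based on the public documentation and may differ across TDW versions.

from tdw.controller import Controller             # assumed import path

c = Controller()                                  # launches/connects to the simulation build
c.communicate({"$type": "create_empty_environment"})   # assumed command name
c.communicate({"$type": "terminate"})             # shut the simulation down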

}, url = {https://arxiv.org/abs/2007.04954}, author = {Gan, Chuang and Schwartz, Jeremy and Alter, Seth and Schrimpf, Martin and Traer, James and De Freitas, Julian and Kubilius, Jonas and Bhandwaldar, Abhishek and Haber, Nick and Sano, Megumi and Wang, Elias and Mrowca, Damian and Lingelbach, Michael and Curtis, Aidan and Feigelis, Kevin and Bear, Daniel M. and Gutfreund, Dan and Cox, David and DiCarlo, James J. and McDermott, Josh and Tenenbaum, Joshua B. and Yamins, Daniel L.K.} } @article {5, title = {Topographic deep artificial neural networks reproduce the hallmarks of the primate inferior temporal cortex face processing network}, journal = {bioRxiv}, year = {2020}, month = {07/2020}, type = {preprint}, abstract = {

A salient characteristic of monkey inferior temporal (IT) cortex is the IT face processing network. Its hallmarks include: "face neurons" that respond more to faces than non-face objects, strong spatial clustering of those neurons in foci at each IT anatomical level ("face patches"), and the preferential interconnection of those foci. While some deep artificial neural networks (ANNs) are good predictors of IT neuronal responses, including face neurons, they do not explain those face network hallmarks. Here we ask if they might be explained with a simple, metabolically motivated addition to current ANN ventral stream models. Specifically, we designed and successfully trained topographic deep ANNs (TDANNs) to solve real-world visual recognition tasks (as in prior work), but, in addition, we also optimized each network to minimize a proxy for neuronal wiring length within its IT layers. We report that after this dual optimization, the model IT layers of TDANNs reproduce the hallmarks of the IT face network: the presence of face neurons, clusters of face neurons that quantitatively match those found in IT face patches, connectivity between those patches, and the emergence of face viewpoint invariance along the network hierarchy. We find that these phenomena emerge for a range of naturalistic experience, but not for highly unnatural training. Taken together, these results show that the IT face processing network could be a consequence of a basic hierarchical anatomy along the ventral stream, selection pressure on the visual system to accomplish general object categorization, and selection pressure to minimize axonal wiring length.
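
Illustrative sketch of one possible wiring-length proxy in the spirit of the dual optimization described above (not the exact published objective): penalize strong response correlations between units assigned distant positions on the simulated cortical sheet.

import torch

def wiring_cost(acts, xy):
    # acts: (batch, units) layer activations; xy: (units, 2) fixed sheet positions
    a = (acts - acts.mean(0)) / (acts.std(0) + 1e-6)
    corr = (a.T @ a) / acts.shape[0]              # unit-by-unit response correlations
    dist = torch.cdist(xy, xy)                    # pairwise cortical distances
    return (corr.abs() * dist).mean()             # correlated-but-far pairs are costly

# total_loss = task_loss + lambda_wiring * wiring_cost(layer_acts, positions)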

}, doi = {https://doi.org/10.1101/2020.07.09.185116}, url = {https://www.biorxiv.org/content/10.1101/2020.07.09.185116v1.full.pdf}, author = {Hyodong Lee and Eshed Margalit and Kamila M. Jozwik and Michael A. Cohen and Nancy Kanwisher and Daniel L. K. Yamins and James J. DiCarlo} } @article {7, title = {Unsupervised changes in core object recognition behavioral performance are accurately predicted by unsupervised neural plasticity in inferior temporal cortex}, journal = {BioRxiv}, year = {2020}, month = {01.2020}, type = {preprint}, abstract = {

Temporal continuity of object identity is a feature of natural visual input, and is potentially exploited -- in an unsupervised manner -- by the ventral visual stream to build the neural representation in inferior temporal (IT) cortex and IT-dependent core object recognition behavior. Here we investigated whether plasticity of individual IT neurons underlies human behavioral changes induced by unsupervised visual experience: we built a single-neuron plasticity model, combined it with a previously established IT population-to-recognition-behavior linking model, and used the result to predict human learning effects. We found that our model quite accurately predicted the mean direction, magnitude and time course of human performance changes. We also found a previously unreported dependency of the observed human performance change on the initial task difficulty. This result adds support to the hypothesis that tolerant core object recognition in human and non-human primates is instructed -- at least in part -- by naturally occurring unsupervised temporal contiguity experience.

}, doi = {https://doi.org/10.1101/2020.01.13.900837}, url = {https://www.biorxiv.org/content/10.1101/2020.01.13.900837v2.full.pdf}, author = {Xiaoxuan Jia and Ha Hong and James J. DiCarlo} } @article {150, title = {Unsupervised Neural Network Models of the Ventral Visual Stream}, journal = {bioRxiv}, year = {2020}, month = {June 18, 2020}, type = {preprint}, abstract = {

Deep neural networks currently provide the best quantitative models of the response patterns of neurons throughout the primate ventral visual stream. However, such networks have remained implausible as a model of the development of the ventral stream, in part because they are trained with supervised methods requiring many more labels than are accessible to infants during development. Here, we report that recent rapid progress in unsupervised learning has largely closed this gap. We find that neural network models learned with deep unsupervised contrastive embedding methods achieve neural prediction accuracy in multiple ventral visual cortical areas that equals or exceeds that of models derived using today's best supervised methods, and that the mapping of these neural network models' hidden layers is neuroanatomically consistent across the ventral stream. Moreover, we find that these methods produce brain-like representations even when trained on noisy and limited data measured from real children's developmental experience. We also find that semi-supervised deep contrastive embeddings can leverage small numbers of labelled examples to produce representations with substantially improved error-pattern consistency to human behavior. Taken together, these results suggest that deep contrastive embedding objectives may be a biologically-plausible computational theory of primate visual development.
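
Sketch of the core contrastive-embedding objective referenced above, in the SimCLR-style NT-Xent form; the temperature and batch handling are placeholder choices, not the exact training recipes evaluated in the paper.

import torch
import torch.nn.functional as F

def nt_xent(z1, z2, tau=0.1):
    # z1, z2: (N, D) embeddings of two augmented views of the same N images
    z = F.normalize(torch.cat([z1, z2]), dim=1)
    sim = z @ z.T / tau                           # cosine similarities / temperature
    sim.fill_diagonal_(-1e9)                      # exclude self-similarity
    n = z1.shape[0]
    targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
    return F.cross_entropy(sim, targets)          # each view's positive is its pair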

}, doi = {10.1101/2020.06.16.155556}, url = {https://www.biorxiv.org/content/10.1101/2020.06.16.155556v1.abstract}, author = {Zhuang, Chengxu and Yan, Siming and Nayebi, Aran and Schrimpf, Martin and Frank, Michael and DiCarlo, James J. and Yamins, Daniel L.K.} } @conference {11, title = {Brain-Like Object Recognition with High-Performing Shallow Recurrent ANNs}, booktitle = {Neural Information Processing Systems}, year = {2019}, abstract = {

Deep convolutional artificial neural networks (ANNs) are the leading class of candidate models of the mechanisms of visual processing in the primate ventral stream. While initially inspired by brain anatomy, over the past years, these ANNs have evolved from a simple eight-layer architecture in AlexNet to extremely deep and branching architectures, demonstrating increasingly better object categorization performance, yet bringing into question how brain-like they still are. In particular, typical deep models from the machine learning community are often hard to map onto the brain's anatomy due to their vast number of layers and missing biologically-important connections, such as recurrence. Here we demonstrate that better anatomical alignment to the brain and high performance on machine learning as well as neuroscience measures do not have to be in contradiction. We developed CORnet-S, a shallow ANN with four anatomically mapped areas and recurrent connectivity, guided by Brain-Score, a new large-scale composite of neural and behavioral benchmarks for quantifying the functional fidelity of models of the primate ventral visual stream. Despite being significantly shallower than most models, CORnet-S is the top model on Brain-Score and outperforms similarly compact models on ImageNet. Moreover, our extensive analyses of CORnet-S circuitry variants reveal that recurrence is the main predictive factor of both Brain-Score and ImageNet top-1 performance. Finally, we report that the temporal evolution of the CORnet-S "IT" neural population resembles the actual monkey IT population dynamics. Taken together, these results establish CORnet-S, a compact, recurrent ANN, as the current best model of the primate ventral visual stream.
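
Toy sketch of the central architectural idea (a shallow convolutional "area" unrolled over a few recurrent time steps); CORnet-S itself adds gating, skips, and anatomically mapped areas that are omitted here.

import torch
import torch.nn as nn

class RecurrentArea(nn.Module):
    def __init__(self, channels, steps=3):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)
        self.steps = steps

    def forward(self, x):
        state = torch.zeros_like(x)
        for _ in range(self.steps):               # within-area recurrence over time
            state = torch.relu(self.conv(x + state))
        return state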

}, doi = {https://papers.nips.cc/paper/9441-brain-like-object-recognition-with-high-performing-shallow-recurrent-anns}, url = {https://papers.nips.cc/paper/9441-brain-like-object-recognition-with-high-performing-shallow-recurrent-anns.pdf}, author = {Jonas Kubilius and Martin Schrimpf and Ha Hong and Najib Majaj and Rajalingham, Rishi and Issa, Elias B. and Kohitij Kar and Bashivan, Pouya and Jonathan Prescott-Roy and Kailyn Schmidt and Aran Nayebi and Daniel Bear and Daniel L. K. Yamins and James J. DiCarlo} } @article {168, title = {Comparing novel object learning in humans, models, and monkeys}, journal = {Journal of Vision}, volume = {19}, year = {2019}, month = {Jun-09-2019}, pages = {114b}, abstract = {

Humans readily learn to identify novel objects, and it has been hypothesized that plasticity in visual cortex supports this behavior. Contributing to this view are reports of experience-driven changes in the properties of neurons at many levels of visual cortex, from V1 to inferotemporal cortex (IT). Here, we ask if object learning might instead be explained by a simple model in which a static set of IT-like visual features is followed by a perceptron learner. Specifically, we measured human (268 subjects; 170,000+ trials) and nonhuman primate (NHP; 2 subjects, 300,000+ trials) behavior across a battery of 29 visuomotor association tasks that each required the subject to learn to discriminate between a pair of synthetically-generated, never-before-seen 3D objects (58 distinct objects). Objects were rendered at varying scales, positions, and rotations; superimposed on naturalistic backgrounds; and presented for 200 msec. We then approximated the visual system's IT response to each image using models of ventral stream processing (i.e. specific deep neural networks trained on ImageNet categorization), and we applied a reward-based perceptron learner to the static set of features produced at the penultimate layer of each model. We report that our model is sufficient to explain both human and NHP rates of learning on these tasks. Additionally, we show humans, NHPs, and this model share the same pattern of performance over objects, but that NHPs reach criterion performance ~10× as slowly as humans (human t = 139, NHP t = 1149), suggesting humans have similar but more rapid learning mechanisms than their NHP cousins in this domain. Taken together, these results suggest the possibility that object learning is mediated by plasticity in a small population of "readout" neurons that learn and execute weighted sums of activity across an upstream sensory population representation (IT) that is largely stable.
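
Illustrative sketch of the model class described above: frozen "IT-like" feature vectors followed by a reward-driven perceptron read-out. The features, labels, and learning rate below are synthetic stand-ins, not the paper's fitted parameters.

import numpy as np

rng = np.random.default_rng(0)
feats = rng.standard_normal((2000, 512))          # stand-in penultimate-layer features
true_w = rng.standard_normal(512)
labels = (feats @ true_w > 0).astype(int)         # which of the two novel objects
w = np.zeros(512)

for x, y in zip(feats, labels):                   # one trial at a time
    choice = int(x @ w > 0)                       # act first, then observe reward
    reward = 1.0 if choice == y else -1.0
    w += 0.01 * reward * (1.0 if choice == 1 else -1.0) * x   # reinforce taken action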

}, issn = {1534-7362}, doi = {10.1167/19.10.114b}, url = {https://jov.arvojournals.org/article.aspx?articleid=2750359}, author = {Lee, Michael J and DiCarlo, James J} } @article {4, title = {Evidence that recurrent circuits are critical to the ventral stream{\textquoteright}s execution of core object recognition behavior}, journal = {Nature Neuroscience}, volume = {22}, year = {2019}, month = {01/2019}, pages = {974 - 983}, issn = {1097-6256}, doi = {10.1038/s41593-019-0392-5}, url = {http://www.nature.com/articles/s41593-019-0392-5}, author = {Kar, Kohitij and Kubilius, Jonas and Schmidt, Kailyn and Issa, Elias B. and DiCarlo, James J.} } @inbook {12, title = {Functional Properties of Circuits, Cellular Populations, and Areas}, booktitle = {The Neocortex}, volume = {27}, year = {2019}, pages = {223-265}, publisher = {The MIT Press}, organization = {The MIT Press}, chapter = {13}, address = {Cambridge, MA}, issn = {978-0-262-04324-3}, doi = {10.7551/mitpress/12593.001.0001}, url = {http://www.esforum.de}, author = {Harris, KD and Groh, JM and DiCarlo, JJ and Fries, P and Kaschube, M and Laurent, G and MacLean, JN and McCormick, DA and Pipa, G and Reynolds, JN and Schwartz, AB and Sejnowski, TJ and Singer, W and Vinck, M}, editor = {Singer, Wolf and Sejnowski, Terrence J. and Rakic, Pasko} } @article {8, title = {Neural population control via deep image synthesis}, journal = {Science}, volume = {364}, year = {2019}, month = {03/2019}, pages = {eaav9436}, abstract = {

Particular deep artificial neural networks (ANNs) are today's most accurate models of the primate brain's ventral visual stream. Using an ANN-driven image synthesis method, we found that luminous power patterns (i.e., images) can be applied to primate retinae to predictably push the spiking activity of targeted V4 neural sites beyond naturally occurring levels. This method, although not yet perfect, achieves unprecedented independent control of the activity state of entire populations of V4 neural sites, even those with overlapping receptive fields. These results show how the knowledge embedded in today's ANN models might be used to noninvasively set desired internal brain states at neuron-level resolution, and suggest that more accurate ANN models would produce even more accurate control.
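
Conceptual sketch of ANN-driven image synthesis by gradient ascent on pixels; here an off-the-shelf AlexNet channel stands in for a mapped model of a recorded V4 site, so treat the model choice and objective as assumptions.

import torch
from torchvision.models import alexnet

model = alexnet(weights="DEFAULT").eval()         # stand-in for a mapped site model
img = torch.zeros(1, 3, 224, 224, requires_grad=True)
opt = torch.optim.Adam([img], lr=0.05)

for _ in range(200):
    acts = model.features(img)
    loss = -acts[0, 42].mean()                    # drive one channel ("neural site")
    opt.zero_grad()
    loss.backward()
    opt.step()
    img.data.clamp_(-1, 1)                        # keep pixels in a bounded range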

}, issn = {0036-8075}, doi = {10.1126/science.aav9436}, url = {http://science.sciencemag.org/cgi/rapidpdf/364/6439/eaav9436?ijkey=iBRdlniG7iYuA\&keytype=ref\&siteid=sci}, author = {Bashivan, Pouya and Kar, Kohitij and DiCarlo, James J.} } @article {10, title = {Reversible Inactivation of Different Millimeter-Scale Regions of Primate IT Results in Different Patterns of Core Object Recognition Deficits}, journal = {Neuron}, volume = {102}, year = {2019}, month = {01/2019}, pages = {493 - 505.e5}, abstract = {

Extensive research suggests that the inferior temporal (IT) population supports visual object recognition behavior. However, causal evidence for this hypothesis has been equivocal, particularly beyond the specific case of face-selective subregions of IT. Here, we directly tested this hypothesis by pharmacologically inactivating individual, millimeter-scale subregions of IT while monkeys performed several core object recognition subtasks, interleaved trial-by-trial. First, we observed that IT inactivation resulted in reliable contralateral-biased subtask-selective behavioral deficits. Moreover, inactivating different IT subregions resulted in different patterns of subtask deficits, predicted by each subregion's neuronal object discriminability. Finally, the similarity between different inactivation effects was tightly related to the anatomical distance between corresponding inactivation sites. Taken together, these results provide direct evidence that the IT cortex causally supports general core object recognition and that the underlying IT coding dimensions are topographically organized.

}, issn = {08966273}, doi = {10.1016/j.neuron.2019.02.001}, url = {https://www.cell.com/neuron/pdfExtended/S0896-6273(19)30110-2}, author = {Rajalingham, Rishi and DiCarlo, James J.} } @article {9, title = {To find better neural network models of human vision, find better neural network models of primate vision}, journal = {BioRxiv}, year = {2019}, month = {07/2019}, type = {preprint}, abstract = {

Specific deep artificial neural networks (ANNs) are the current best models of ventral visual processing and object recognition behavior in monkeys. Here we explore whether models of non-human primate vision generalize to visual processing in the human primate brain. Specifically, we asked if model match to monkey IT is a predictor of model match to human IT, even when scoring those matches on different images. We found that the model match to monkey IT is a positive predictor of the model match to human IT (R = 0.36), and that this approach outperforms the current standard predictor (model accuracy on ImageNet). This suggests a more powerful approach for pre-selecting models as hypotheses of human brain processing.

}, doi = {https://doi.org/10.1101/688390}, url = {https://www.biorxiv.org/content/10.1101/688390v1.full.pdf}, author = {Kamila M. Jozwik and Martin Schrimpf and Nancy Kanwisher and James J. DiCarlo} } @conference {166, title = {Using Brain-Score to Evaluate and Build Neural Networks for Brain-Like Object Recognition}, booktitle = {Computational and Systems Neuroscience (COSYNE)}, year = {2019}, address = {Denver, CO}, author = {Schrimpf, Martin and Kubilius, Jonas and Hong, Ha and Majaj, Najib and Rajalingham, Rishi and Issa, Elias B and Kar, Kohitij and Ziemba, Corey M and Bashivan, Pouya and Prescott-Roy, Jonathan and Schmidt, Kailyn and Yamins, Daniel LK and DiCarlo, James J} } @article {22, title = {Brain-Score: Which Artificial Neural Network for Object Recognition is most Brain-Like?}, journal = {bioRxiv}, year = {2018}, month = {09/2018}, type = {preprint}, abstract = {

The internal representations of early deep artificial neural networks (ANNs) were found to be remarkably similar to the internal neural representations measured experimentally in the primate brain. Here we ask, as deep ANNs have continued to evolve, are they becoming more or less brain-like? ANNs that are most functionally similar to the brain will contain mechanisms that are most like those used by the brain. We therefore developed Brain-Score - a composite of multiple neural and behavioral benchmarks that score any ANN on how similar it is to the brain's mechanisms for core object recognition - and we deployed it to evaluate a wide range of state-of-the-art deep ANNs. Using this scoring system, we here report that: (1) DenseNet-169, CORnet-S and ResNet-101 are the most brain-like ANNs. (2) There remains considerable variability in neural and behavioral responses that is not predicted by any ANN, suggesting that no ANN model has yet captured all the relevant mechanisms. (3) Extending prior work, we found that gains in ANN ImageNet performance led to gains on Brain-Score. However, the correlation weakened at >= 70% top-1 ImageNet performance, suggesting that additional guidance from neuroscience is needed to make further advances in capturing brain mechanisms. (4) We uncovered smaller (i.e. less complex) ANNs that are more brain-like than many of the best-performing ImageNet models, which suggests the opportunity to simplify ANNs to better understand the ventral stream. The scoring system used here is far from complete. However, we propose that evaluating and tracking model-benchmark correspondences through a Brain-Score that is regularly updated with new brain data is an exciting opportunity: experimental benchmarks can be used to guide machine network evolution, and machine networks are mechanistic hypotheses of the brain's network and thus drive next experiments. To facilitate both of these, we release Brain-Score.org: a platform that hosts the neural and behavioral benchmarks, where ANNs for visual processing can be submitted to receive a Brain-Score and their rank relative to other models, and where new experimental data can be naturally incorporated.
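
Sketch of the composite-scoring logic described above, under the assumption that each benchmark score is normalized by its ceiling and benchmarks are then averaged; the numbers are invented for illustration.

raw = {"V4-neural": 0.60, "IT-neural": 0.55, "behavior": 0.40}      # made-up scores
ceiling = {"V4-neural": 0.85, "IT-neural": 0.80, "behavior": 0.70}  # made-up ceilings

normalized = {b: raw[b] / ceiling[b] for b in raw}   # fraction of explainable signal
brain_score = sum(normalized.values()) / len(normalized)
print(f"composite score: {brain_score:.3f}")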

}, doi = {https://doi.org/10.1101/407007}, url = {https://www.biorxiv.org/content/10.1101/407007v2.full.pdf}, author = {Martin Schrimpf and Kubilius, Jonas and Ha Hong and Najib Majaj and Rajalingham, Rishi and Issa, Elias B. and Kar, Kohitij and Bashivan, Pouya and Jonathan Prescott-Roy and Schmidt, Kailyn and Daniel L. K. Yamins and DiCarlo, James J.} } @article {23, title = {CORnet: Modeling the Neural Mechanisms of Core Object Recognition}, journal = {bioRxiv}, year = {2018}, month = {09/2018}, type = {preprint}, abstract = {

Deep artificial neural networks with spatially repeated processing (a.k.a., deep convolutional ANNs) have been established as the best class of candidate models of visual processing in the primate ventral visual stream. Over the past five years, these ANNs have evolved from a simple feedforward eight-layer architecture in AlexNet to extremely deep and branching NASNet architectures, demonstrating increasingly better object categorization performance and increasingly better explanatory power of both neural and behavioral responses. However, from the neuroscientist's point of view, the relationship between such very deep architectures and the ventral visual pathway is incomplete in at least two ways. On the one hand, current state-of-the-art ANNs appear to be too complex (e.g., now over 100 levels) compared with the relatively shallow cortical hierarchy (4-8 levels), which makes it difficult to map their elements to those in the ventral visual stream and to understand what they are doing. On the other hand, current state-of-the-art ANNs appear to be not complex enough in that they lack recurrent connections and the resulting neural response dynamics that are commonplace in the ventral visual stream. Here we describe our ongoing efforts to resolve both of these issues by developing a "CORnet" family of deep neural network architectures. Rather than just seeking high object recognition performance (as the state-of-the-art ANNs above), we instead try to reduce the model family to its most important elements and then gradually build new ANNs with recurrent and skip connections while monitoring both performance and the match between each new CORnet model and a large body of primate brain and behavioral data. We report here that our current best ANN model derived from this approach (CORnet-S) is among the top models on Brain-Score, a composite benchmark for comparing models to the brain, but is simpler than other deep ANNs in terms of the number of convolutions performed along the longest path of information processing in the model. All CORnet models are available at https://github.com/dicarlolab/CORnet, and we plan to update this manuscript and the available models in this family as they are produced.

}, doi = {https://doi.org/10.1101/408385}, url = {https://www.biorxiv.org/content/10.1101/408385v1.full.pdf}, author = {Kubilius, Jonas and Martin Schrimpf and Aran Nayebi and Daniel Bear and Daniel L. K. Yamins and DiCarlo, James J.} } @article {16, title = {Deep learning reaches the motor system}, journal = {Nature Methods}, volume = {15}, year = {2018}, month = {Jan-10-2018}, pages = {772 - 773}, type = {News and Views}, abstract = {

A new article by Pandarinath et al. describes an artificial neural network model that captures some key aspects of the activity of populations of neurons in the primary motor cortex.

}, issn = {1548-7091}, doi = {10.1038/s41592-018-0152-6}, url = {https://doi.org/10.1038/s41592-018-0152-6}, author = {Batista, Aaron P. and DiCarlo, James J.} } @article {19, title = {Evidence that recurrent circuits are critical to the ventral stream{\textquoteright}s execution of core object recognition behavior}, journal = {bioRxiv}, year = {2018}, month = {06/2018}, type = {preprint}, abstract = {

Non-recurrent deep convolutional neural networks (DCNNs) are currently the best models of core object recognition, a behavior supported by the densely recurrent primate ventral stream, culminating in the inferior temporal (IT) cortex. Are these recurrent circuits critical to the ventral stream's execution of this behavior? We reasoned that, if recurrence is critical, then primates should outperform feedforward-only DCNNs for some images, and that these images should require additional processing time beyond the feedforward IT response. Here we first used behavioral methods to discover hundreds of these "challenge" images. Second, using large-scale IT electrophysiology in animals performing core recognition tasks, we observed that behaviorally-sufficient, linearly-decodable object identity solutions emerged ~30 ms (on average) later in IT for challenge images compared to DCNN and primate performance-matched "control" images. We observed these same late solutions even during passive viewing. Third, consistent with a failure of feedforward computations, the behaviorally-critical late-phase IT population response patterns evoked by the challenge images were poorly predicted by DCNN activations. Interestingly, deeper CNNs better predicted these late IT responses, suggesting a functional equivalence between recurrence and additional nonlinear transformations. Our results argue that automatically-evoked recurrent circuits are critical even for rapid object identification. By precisely comparing current DCNNs, primate behavior and IT population dynamics, we provide guidance for future recurrent model development.
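
Illustrative sketch (synthetic data) of the object-solution-time logic: decode object identity from the IT population in successive time bins and record the first bin whose cross-validated accuracy reaches a behavior-matched threshold.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_trials, n_sites, n_bins = 300, 150, 20
labels = rng.integers(0, 2, n_trials)
axis = rng.standard_normal(n_sites)

solution_bin = None
for t in range(n_bins):                           # successive post-onset time bins
    gain = 0.05 * t                               # identity signal grows with time
    X = rng.standard_normal((n_trials, n_sites)) + gain * labels[:, None] * axis
    acc = cross_val_score(LogisticRegression(max_iter=500), X, labels, cv=3).mean()
    if solution_bin is None and acc >= 0.85:      # behavior-matched threshold
        solution_bin = t
print("object solution bin:", solution_bin)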

}, doi = {https://doi.org/10.1101/354753}, url = {https://www.biorxiv.org/content/10.1101/354753v1.full.pdf}, author = {Kar, Kohitij and Kubilius, Jonas and Schmidt, Kailyn and Issa, Elias B and DiCarlo, James J.} } @article {13, title = {Large-Scale, High-Resolution Comparison of the Core Visual Object Recognition Behavior of Humans, Monkeys, and State-of-the-Art Deep Artificial Neural Networks}, journal = {The Journal of Neuroscience}, volume = {38}, year = {2018}, month = {03/2019}, pages = {7255 - 7269}, abstract = {

Primates, including humans, can typically recognize objects in visual images at a glance even in the face of naturally occurring identity-preserving image transformations (e.g. changes in viewpoint). A primary neuroscience goal is to uncover neuron-level mechanistic models that quantitatively explain this behavior by predicting primate performance for each and every image. Here, we applied this stringent behavioral prediction test to the leading mechanistic models of primate vision (specifically, deep, convolutional, artificial neural networks; ANNs) by directly comparing their behavioral signatures against those of humans and rhesus macaque monkeys. Using high-throughput data collection systems for human and monkey psychophysics, we collected over one million behavioral trials from 1472 anonymous humans and five male macaque monkeys for 2400 images over 276 binary object discrimination tasks. Consistent with previous work, we observed that state-of-the-art deep, feed-forward convolutional ANNs trained for visual categorization (termed DCNN models) accurately predicted primate patterns of object-level confusion. However, when we examined behavioral performance for individual images within each object discrimination task, we found that all tested DCNN models were significantly non-predictive of primate performance, and that this prediction failure was not accounted for by simple image attributes, nor rescued by simple model modifications. These results show that current DCNN models cannot account for the image-level behavioral patterns of primates, and that new ANN models are needed to more precisely capture the neural mechanisms underlying primate object vision. To this end, large-scale, high-resolution primate behavioral benchmarks, such as those obtained here, could serve as direct guides for discovering such models. Recently, specific feed-forward deep convolutional artificial neural network (ANN) models have dramatically advanced our quantitative understanding of the neural mechanisms underlying primate core object recognition. In this work, we tested the limits of those ANNs by systematically comparing the behavioral responses of these models with the behavioral responses of humans and monkeys, at the resolution of individual images. Using these high-resolution metrics, we found that all tested ANN models significantly diverged from primate behavior. Going forward, these high-resolution, large-scale primate behavioral benchmarks could serve as direct guides for discovering better ANN models of the primate visual system.
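
Illustrative sketch of an image-level (rather than object-level) comparison: correlate per-image accuracies of a model with per-image primate accuracies. The paper's actual metrics additionally correct for measurement noise and task context; the data here are synthetic.

import numpy as np

rng = np.random.default_rng(0)
primate = rng.uniform(0.5, 1.0, 2400)             # per-image primate accuracy
model = 0.5 * primate + 0.5 * rng.uniform(0.5, 1.0, 2400)   # a weakly matched model

r = np.corrcoef(primate, model)[0, 1]             # raw image-level consistency
print(f"image-level consistency: r = {r:.2f}")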

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.0388-18.2018}, url = {http://www.jneurosci.org/content/38/33/7255}, author = {Rajalingham, Rishi and Issa, Elias B. and Bashivan, Pouya and Kar, Kohitij and Schmidt, Kailyn and DiCarlo, James J.} } @article {21, title = {Large-scale, high-resolution comparison of the core visual object recognition behavior of humans, monkeys, and state-of-the-art deep artificial neural networks}, journal = {bioRxiv}, year = {2018}, month = {02/2018}, type = {preprint}, abstract = {

Primates, including humans, can typically recognize objects in visual images at a glance even in the face of naturally occurring identity-preserving image transformations (e.g. changes in viewpoint). A primary neuroscience goal is to uncover neuron-level mechanistic models that quantitatively explain this behavior by predicting primate performance for each and every image. Here, we applied this stringent behavioral prediction test to the leading mechanistic models of primate vision (specifically, deep, convolutional, artificial neural networks; ANNs) by directly comparing their behavioral signatures against those of humans and rhesus macaque monkeys. Using high-throughput data collection systems for human and monkey psychophysics, we collected over one million behavioral trials for 2400 images over 276 binary object discrimination tasks. Consistent with previous work, we observed that state-of-the-art deep, feed-forward convolutional ANNs trained for visual categorization (termed DCNN_IC models) accurately predicted primate patterns of object-level confusion. However, when we examined behavioral performance for individual images within each object discrimination task, we found that all tested DCNN_IC models were significantly non-predictive of primate performance, and that this prediction failure was not accounted for by simple image attributes, nor rescued by simple model modifications. These results show that current DCNN_IC models cannot account for the image-level behavioral patterns of primates, and that new ANN models are needed to more precisely capture the neural mechanisms underlying primate object vision. To this end, large-scale, high-resolution primate behavioral benchmarks (such as those obtained here) could serve as direct guides for discovering such models.

}, doi = {https://doi.org/10.1101/240614}, url = {https://www.biorxiv.org/content/10.1101/240614v4.full.pdf}, author = {Rajalingham, Rishi and Issa, Elias B and Bashivan, Pouya and Kar, Kohitij and Schmidt, Kailyn and DiCarlo, James J.} } @article {14, title = {Minimally invasive multimode optical fiber microendoscope for deep brain fluorescence imaging}, journal = {Biomedical Optics Express}, volume = {9}, year = {2018}, month = {01/2018}, pages = {1492-1509}, abstract = {

A major open challenge in neuroscience is the ability to measure and perturb neural activity in vivo from well-defined neural sub-populations at cellular resolution anywhere in the brain. However, limitations posed by scattering and absorption prohibit non-invasive multi-photon approaches for deep (>2 mm) structures, while gradient refractive index (GRIN) endoscopes are relatively thick and can cause significant damage upon insertion. Here, we present a novel micro-endoscope design to image neural activity at arbitrary depths via an ultra-thin multi-mode optical fiber (MMF) probe that is 5-10× thinner than commercially available micro-endoscopes. We demonstrate micron-scale resolution, multi-spectral and volumetric imaging. In contrast to previous approaches, we show that this method has an improved acquisition speed that is sufficient to capture rapid neuronal dynamics in vivo in rodents expressing a genetically encoded calcium indicator (GCaMP). Our results emphasize the potential of this technology in neuroscience applications and open up possibilities for cellular resolution imaging in previously unreachable brain regions.

}, issn = {2156-7085}, doi = {10.1364/BOE.9.001492}, url = {https://www.osapublishing.org/abstract.cfm?URI=boe-9-4-1492}, author = {Ohayon, Shay and Caravaca-Aguirre, Antonio and Piestun, Rafael and DiCarlo, James J.} } @article {15, title = {Neural dynamics at successive stages of the ventral visual stream are consistent with hierarchical error signals.}, journal = {eLife}, volume = {7}, year = {2018}, month = {11/2018}, abstract = {

Ventral visual stream neural responses are dynamic, even for static image presentations. However, dynamical neural models of visual cortex are lacking, as most progress has been made modeling static, time-averaged responses. Here, we studied population neural dynamics during face detection across three cortical processing stages. Remarkably, ~30 milliseconds after the initially evoked response, we found that neurons in intermediate-level areas decreased their responses to typical configurations of their preferred face parts relative to their response for atypical configurations, even while neurons in higher areas achieved and maintained a preference for typical configurations. These hierarchical neural dynamics were inconsistent with standard feedforward circuits. Rather, recurrent models computing prediction errors between stages captured the observed temporal signatures. This model of neural dynamics, which simply augments the standard feedforward model of online vision, suggests that neural responses to static images may encode top-down prediction errors in addition to bottom-up feature estimates.
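
Toy sketch of the prediction-error dynamics described above: a higher stage predicts the lower stage's activity, and the lower stage's response carries the residual. Weights, rates, and dimensions are arbitrary placeholders.

import numpy as np

rng = np.random.default_rng(0)
W_fb = 0.3 * rng.standard_normal((10, 10))        # top-down prediction weights
W_ff = 0.3 * rng.standard_normal((10, 10))        # bottom-up weights
x = rng.standard_normal(10)                       # static input (one image)
lower, higher = np.zeros(10), np.zeros(10)

for t in range(30):                               # iterate the recurrent dynamics
    prediction = W_fb @ higher
    lower = x - prediction                        # lower stage reports the error
    higher = higher + 0.2 * (W_ff @ lower - higher)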

}, keywords = {Animals, Brain Mapping, Face, Humans, Macaca mulatta, Models, Neurological, Neurons, Pattern Recognition, Photic Stimulation, Reaction Time, Visual, Visual Cortex, Visual Perception}, issn = {2050-084X}, doi = {10.7554/eLife.42870}, url = {https://elifesciences.org/articles/42870}, author = {Issa, Elias B and Cadieu, Charles F and DiCarlo, James J} } @article {17, title = {Reversible inactivation of different millimeter-scale regions of primate IT results in different patterns of core object recognition deficits}, journal = {bioRxiv}, year = {2018}, month = {08/2018}, type = {preprint}, abstract = {

Extensive research suggests that the inferior temporal (IT) population supports visual object recognition behavior. However, causal evidence for this hypothesis has been equivocal, particularly beyond the specific case of face-selective sub-regions of IT. Here, we directly tested this hypothesis by pharmacologically inactivating individual, millimeter-scale sub-regions of IT while monkeys performed several object discrimination tasks, interleaved trial-by-trial. First, we observed that IT inactivation resulted in reliable contralateral-biased task-selective behavioral deficits. Moreover, inactivating different IT sub-regions resulted in different patterns of task deficits, each predicted by that sub-region's neuronal object discriminability. Finally, the similarity between different inactivation effects was tightly related to the anatomical distance between corresponding inactivation sites. Taken together, these results provide direct evidence that IT cortex causally supports general core object recognition, and that the underlying IT codes are topographically organized.

}, doi = {https://doi.org/10.1101/390245}, url = {https://www.biorxiv.org/content/10.1101/390245v1.full.pdf}, author = {Rajalingham, Rishi and DiCarlo, James J.} } @article {20, title = {Task-Driven Convolutional Recurrent Models of the Visual System}, journal = {arXiv}, year = {2018}, month = {06/2018}, type = {preprint}, abstract = {

Feed-forward convolutional neural networks (CNNs) are currently state-of-the-art for object classification tasks such as ImageNet. Further, they are quantitatively accurate models of temporally-averaged responses of neurons in the primate brain's visual system. However, biological visual systems have two ubiquitous architectural features not shared with typical CNNs: local recurrence within cortical areas, and long-range feedback from downstream areas to upstream areas. Here we explored the role of recurrence in improving classification performance. We found that standard forms of recurrence (vanilla RNNs and LSTMs) do not perform well within deep CNNs on the ImageNet task. In contrast, custom cells that incorporated two structural features, bypassing and gating, were able to boost task accuracy substantially. We extended these design principles in an automated search over thousands of model architectures, which identified novel local recurrent cells and long-range feedback connections useful for object recognition. Moreover, these task-optimized ConvRNNs explained the dynamics of neural activity in the primate visual system better than feedforward networks, suggesting a role for the brain's recurrent connections in performing difficult visual behaviors.
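
Toy sketch combining the two ingredients named above, bypassing and gating, in a single recurrent cell; the cells found by the paper's automated search are more elaborate than this.

import torch
import torch.nn as nn

class GatedBypassCell(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.recur = nn.Conv2d(channels, channels, 3, padding=1)
        self.gate = nn.Conv2d(channels, channels, 1)

    def forward(self, x, h):
        # x: feedforward input; h: previous hidden state, same shape as x
        g = torch.sigmoid(self.gate(h))           # how much recurrence to admit
        return g * torch.relu(self.recur(h)) + (1 - g) * x   # gated path + bypass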

}, doi = {https://arxiv.org/abs/1807.00053}, url = {https://arxiv.org/pdf/1807.00053.pdf}, author = {Aran Nayebi and Daniel Bear and Kubilius, Jonas and Ganguli, S and Sussillo, D and DiCarlo, James J. and Yamins, DLK} } @article {18, title = {Teacher Guided Architecture Search}, journal = {arXiv}, year = {2018}, month = {04/2018}, type = {preprint}, abstract = {

Much of the recent improvement in neural networks for computer vision has resulted from the discovery of new network architectures. Most prior work has used the performance of candidate models following limited training to automatically guide the search in a feasible way. Could further gains in computational efficiency be achieved by guiding the search via measurements of a high-performing network with unknown detailed architecture (e.g. the primate visual system)? As one step toward this goal, we use representational similarity analysis to evaluate the similarity of internal activations of candidate networks with those of a (fixed, high-performing) teacher network. We show that adopting this evaluation metric could produce up to an order of magnitude improvement in search efficiency over performance-guided methods. Our approach finds a convolutional cell structure with performance similar to that previously found using other methods, but at a total computational cost that is two orders of magnitude lower than Neural Architecture Search (NAS) and more than four times lower than progressive neural architecture search (PNAS). We further show that measurements from only ~300 neurons in the primate visual system provide enough signal to find a network with an ImageNet top-1 error that is significantly lower than that achieved by performance-guided architecture search alone. These results suggest that representational matching can be used to accelerate network architecture search in cases where one has access to some or all of the internal representations of a teacher network of interest, such as the brain's sensory processing networks.
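
Sketch of the similarity scoring described above: build representational dissimilarity matrices (RDMs) for teacher and candidate activations on shared images, then correlate them; activations below are random placeholders.

import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
teacher = rng.standard_normal((100, 300))         # e.g., ~300 recorded neurons
candidate = rng.standard_normal((100, 512))       # a candidate layer, same 100 images

rdm_t = pdist(teacher, metric="correlation")      # condensed RDMs (upper triangles)
rdm_c = pdist(candidate, metric="correlation")
score, _ = spearmanr(rdm_t, rdm_c)
print(f"RSA score vs. teacher: {score:.2f}")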

}, doi = {https://arxiv.org/abs/1808.01405}, url = {https://arxiv.org/pdf/1808.01405.pdf}, author = {Bashivan, Pouya and Tensen, Mark and DiCarlo, James J.} } @article {25, title = {Eight open questions in the computational modeling of higher sensory cortex}, journal = {Current Opinion in Neurobiology}, volume = {37}, year = {2016}, month = {01/2016}, pages = {114 - 120}, abstract = {

Propelled by advances in biologically inspired computer vision and artificial intelligence, the past five years have seen significant progress in using deep neural networks to model response patterns of neurons in visual cortex. In this paper, we briefly review this progress and then discuss eight key 'open questions' that we believe will drive research in computational models of sensory systems over the next five years, both in visual cortex and beyond.

}, issn = {1873-6882}, doi = {10.1016/j.conb.2016.02.001}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438816300022}, author = {Yamins, Daniel LK and DiCarlo, James J} } @article {27, title = {Explicit information for category-orthogonal object properties increases along the ventral stream}, journal = {Nature Neuroscience}, volume = {19}, year = {2016}, month = {02/2016}, pages = {613 - 622}, abstract = {

Extensive research has revealed that the ventral visual stream hierarchically builds a robust representation for supporting visual object categorization tasks. We systematically explored the ability of multiple ventral visual areas to support a variety of 'category-orthogonal' object properties such as position, size and pose. For complex naturalistic stimuli, we found that the inferior temporal (IT) population encodes all measured category-orthogonal object properties, including those properties often considered to be low-level features (for example, position), more explicitly than earlier ventral stream areas. We also found that the IT population better predicts human performance patterns across properties. A hierarchical neural network model based on simple computational principles generates these same cross-area patterns of information. Taken together, our empirical results support the hypothesis that all behaviorally relevant object properties are extracted in concert up the ventral visual hierarchy, and our computational model explains how that hierarchy might be built.

}, issn = {1097-6256}, doi = {10.1038/nn.4247}, url = {http://www.nature.com/articles/nn.4247}, author = {Hong, Ha and Yamins, Daniel L K and Majaj, Najib J and DiCarlo, James J} } @article {24, title = {Neurophysiological Organization of the Middle Face Patch in Macaque Inferior Temporal Cortex}, journal = {The Journal of Neuroscience}, volume = {36}, year = {2016}, month = {02/2017}, pages = {12729 - 12745}, abstract = {

While early cortical visual areas contain fine-scale spatial organization of neuronal properties, such as orientation preference, the spatial organization of higher-level visual areas is less well understood. The fMRI demonstration of face-preferring regions in human ventral cortex and monkey inferior temporal cortex ("face patches") raises the question of how neural selectivity for faces is organized. Here, we targeted hundreds of spatially registered neural recordings to the largest fMRI-identified face-preferring region in monkeys, the middle face patch (MFP), and show that the MFP contains a graded enrichment of face-preferring neurons. At its center, as much as 93% of the sites we sampled responded twice as strongly to faces as to nonface objects. We estimate the maximum neurophysiological size of the MFP to be ~6 mm in diameter, consistent with its previously reported size under fMRI. Importantly, face selectivity in the MFP varied strongly even between neighboring sites. Additionally, extremely face-selective sites were ~40 times more likely to be present inside the MFP than outside. These results provide the first direct quantification of the size and neural composition of the MFP by showing that the cortical tissue localized to the fMRI-defined region consists of a very high fraction of face-preferring sites near its center, and a monotonic decrease in that fraction along any radial spatial axis.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.0237-16.2016}, url = {https://www.jneurosci.org/content/jneuro/36/50/12729.full.pdf}, author = {Aparicio, Paul L. and Issa, Elias B. and DiCarlo, James J.} } @article {26, title = {Using goal-driven deep learning models to understand sensory cortex}, journal = {Nature Neuroscience}, volume = {19}, year = {2016}, month = {01/2016}, pages = {356 - 365}, abstract = {

Fueled by innovation in the computer vision and artificial intelligence communities, recent developments in computational neuroscience have used goal-driven hierarchical convolutional neural networks (HCNNs) to make strides in modeling neural single-unit and population responses in higher visual cortical areas. In this Perspective, we review the recent progress in a broader modeling context and describe some of the key technical innovations that have supported it. We then outline how the goal-driven HCNN approach can be used to delve even more deeply into understanding the development and organization of sensory cortical processing.

}, issn = {1097-6256}, doi = {10.1038/nn.4244}, url = {http://www.nature.com/articles/nn.4244.pdf}, author = {Yamins, Daniel L K and DiCarlo, James J} } @article {29, title = {Comparison of Object Recognition Behavior in Human and Monkey}, journal = {Journal of Neuroscience}, volume = {35}, year = {2015}, month = {02/2015}, pages = {12127 - 12136}, abstract = {

Although the rhesus monkey is used widely as an animal model of human visual processing, it is not known whether invariant visual object recognition behavior is quantitatively comparable across monkeys and humans. To address this question, we systematically compared the core object recognition behavior of two monkeys with that of human subjects. To test true object recognition behavior (rather than image matching), we generated several thousand naturalistic synthetic images of 24 basic-level objects with high variation in viewing parameters and image background. Monkeys were trained to perform binary object recognition tasks on a match-to-sample paradigm. Data from 605 human subjects performing the same tasks on Mechanical Turk were aggregated to characterize "pooled human" object recognition behavior, as well as 33 separate Mechanical Turk subjects to characterize individual human subject behavior. Our results show that monkeys learn each new object in a few days, after which they not only match mean human performance but show a pattern of object confusion that is highly correlated with pooled human confusion patterns and is statistically indistinguishable from individual human subjects. Importantly, this shared human and monkey pattern of 3D object confusion is not shared with low-level visual representations (pixels, V1+; models of the retina and primary visual cortex) but is shared with a state-of-the-art computer vision feature representation. Together, these results are consistent with the hypothesis that rhesus monkeys and humans share a common neural shape representation that directly supports object perception.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.0573-15.2015}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.0573-15.2015}, author = {Rajalingham, R. and Schmidt, K. and DiCarlo, J. J.} } @article {30, title = {Optogenetic and pharmacological suppression of spatial clusters of face neurons reveal their causal role in face gender discrimination}, journal = {Proceedings of the National Academy of Sciences}, year = {2015}, month = {05/2015}, pages = {6730 - 6735}, abstract = {

Neurons that respond more to images of faces than to nonface objects were identified in the inferior temporal (IT) cortex of primates three decades ago. Although it is hypothesized that perceptual discrimination between faces depends on the neural activity of IT subregions enriched with "face neurons," such a causal link has not been directly established. Here, using optogenetic and pharmacological methods, we reversibly suppressed the neural activity in small subregions of IT cortex of macaque monkeys performing a facial gender-discrimination task. Each type of intervention independently demonstrated that suppression of IT subregions enriched in face neurons induced a contralateral deficit in face gender-discrimination behavior. The same neural suppression of other IT subregions produced no detectable change in behavior. These results establish a causal link between the neural activity in IT face neuron subregions and face gender-discrimination behavior. Also, the demonstration that brief neural suppression of specific spatial subregions of IT induces behavioral effects opens the door for applying the technical advantages of optogenetics to a systematic attack on the causal relationship between IT cortex and high-level visual perception.

}, issn = {0027-8424}, doi = {10.1073/pnas.1423328112}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1423328112}, author = {Afraz, Arash and Boyden, Edward S. and DiCarlo, James J.} } @article {28, title = {Simple Learned Weighted Sums of Inferior Temporal Neuronal Firing Rates Accurately Predict Human Core Object Recognition Performance}, journal = {Journal of Neuroscience}, volume = {35}, year = {2015}, month = {09/2015}, pages = {13402 - 13418}, abstract = {

To go beyond qualitative models of the biological substrate of object recognition, we ask: can a single ventral stream neuronal linking hypothesis quantitatively account for core object recognition performance over a broad range of tasks? We measured human performance in 64 object recognition tests using thousands of challenging images that explore shape similarity and identity-preserving object variation. We then used multielectrode arrays to measure neuronal population responses to those same images in visual areas V4 and inferior temporal (IT) cortex of monkeys and simulated V1 population responses. We tested leading candidate linking hypotheses and control hypotheses, each postulating how ventral stream neuronal responses underlie object recognition behavior. Specifically, for each hypothesis, we computed the predicted performance on the 64 tests and compared it with the measured pattern of human performance. All tested hypotheses based on low- and mid-level visually evoked activity (pixels, V1, and V4) were very poor predictors of the human behavioral pattern. However, simple learned weighted sums of distributed average IT firing rates exactly predicted the behavioral pattern. More elaborate linking hypotheses relying on IT trial-by-trial correlational structure, finer IT temporal codes, or ones that strictly respect the known spatial substructures of IT ("face patches") did not improve predictive power. Although these results do not reject those more elaborate hypotheses, they suggest a simple, sufficient quantitative model: each object recognition task is learned from the spatially distributed mean firing rates (100 ms) of ~60,000 IT neurons and is executed as a simple weighted sum of those firing rates.
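A minimal sketch of the "learned weighted sum" idea with synthetic stand-in data; the decoder choice, dimensions, and signal structure here are illustrative assumptions, not the paper's recordings or pipeline.

```python
# Sketch: a behavioral task read out as a learned weighted sum of
# trial-averaged firing rates (logistic regression = weighted sum +
# squashing for choice probability).
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(1)
n_images, n_units = 200, 168            # hypothetical image and site counts
labels = np.repeat([0, 1], n_images // 2)

# Mean firing rates (e.g., in a 100 ms window) with weak class structure.
rates = rng.normal(10, 2, (n_images, n_units))
rates[labels == 1, :20] += 0.8          # hypothetical signal in 20 units

decoder = LogisticRegression(max_iter=1000)
acc = cross_val_score(decoder, rates, labels, cv=5).mean()
print(f"cross-validated accuracy: {acc:.2f}")
```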

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.5181-14.2015}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.5181-14.2015}, author = {Majaj, N. J. and Hong, H. and Solomon, E. A. and DiCarlo, J. J.} } @article {32, title = {Deep Neural Networks Rival the Representation of Primate IT Cortex for Core Visual Object Recognition}, journal = {PLoS Computational Biology}, volume = {10}, year = {2014}, month = {12/2014}, pages = {e1003963}, abstract = {

The primate visual system achieves remarkable visual object recognition performance even in brief presentations, and under changes to object exemplar, geometric transformations, and background variation (a.k.a. core visual object recognition). This remarkable performance is mediated by the representation formed in inferior temporal (IT) cortex. In parallel, recent advances in machine learning have led to ever higher performing models of object recognition using artificial deep neural networks (DNNs). It remains unclear, however, whether the representational performance of DNNs rivals that of the brain. To accurately produce such a comparison, a major difficulty has been a unifying metric that accounts for experimental limitations, such as the amount of noise, the number of neural recording sites, and the number of trials, and computational limitations, such as the complexity of the decoding classifier and the number of classifier training examples. In this work, we perform a direct comparison that corrects for these experimental limitations and computational considerations. As part of our methodology, we propose an extension of "kernel analysis" that measures the generalization accuracy as a function of representational complexity. Our evaluations show that, unlike previous bio-inspired models, the latest DNNs rival the representational performance of IT cortex on this visual object recognition task. Furthermore, we show that models that perform well on measures of representational performance also perform well on measures of representational similarity to IT, and on measures of predicting individual IT multi-unit responses. Whether these DNNs rely on computational mechanisms similar to the primate visual system is yet to be determined, but, unlike all previous bio-inspired models, that possibility cannot be ruled out merely on representational performance grounds.
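A simplified sketch of a kernel-analysis-style curve in the spirit of the cited approach: label predictability from the leading eigencomponents of a representation's kernel matrix, as a function of the number of components (a proxy for representational complexity). All data, the linear kernel, and the classifier are assumptions for illustration; the paper's extension differs in detail.

```python
# Sketch: accuracy vs. number of kernel eigencomponents.
import numpy as np
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(2)
X = rng.normal(size=(300, 512))          # hypothetical features (images x dims)
y = rng.integers(0, 8, 300)              # hypothetical category labels

K = X @ X.T                              # linear kernel matrix (uncentered, for brevity)
eigvals, eigvecs = np.linalg.eigh(K)     # eigh returns ascending eigenvalues
order = np.argsort(eigvals)[::-1]        # sort descending

for k in (1, 4, 16, 64):
    # Kernel-PCA scores from the top-k eigencomponents.
    Z = eigvecs[:, order[:k]] * np.sqrt(np.maximum(eigvals[order[:k]], 0))
    acc = cross_val_score(RidgeClassifier(), Z, y, cv=5).mean()
    print(f"{k:3d} components: accuracy {acc:.2f}")
```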

}, doi = {10.1371/journal.pcbi.1003963}, url = {https://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1003963\&type=printable}, author = {Cadieu, Charles F. and Hong, Ha and Yamins, Daniel L. K. and Pinto, Nicolas and Ardila, Diego and Solomon, Ethan A. and Majaj, Najib J. and DiCarlo, James J.}, editor = {Bethge, Matthias} } @article {33, title = {Neural Mechanisms Underlying Visual Object Recognition}, journal = {Cold Spring Harbor Symposia on Quantitative Biology}, volume = {79}, year = {2014}, month = {2014}, pages = {99 - 107}, abstract = {

Invariant visual object recognition and the underlying neural representations are fundamental to higher-level human cognition. To understand these neural underpinnings, we combine human and monkey psychophysics, large-scale neurophysiology, neural perturbation methods, and computational modeling to construct falsifiable, predictive models that aim to fully account for the neural encoding and decoding processes that underlie visual object recognition. A predictive encoding model must minimally describe the transformation of the retinal image to population patterns of neural activity along the entire cortical ventral stream of visual processing and must accurately predict the responses to any retinal image. A predictive decoding model must minimally describe the transformation from those population patterns of neural activity to observed object recognition behavior (i.e., subject reports), and, given that population pattern of activity, it must accurately predict behavior for any object recognition task. To date, we have focused on core object recognition, a remarkable behavior that is accomplished with image viewing durations of <200 msec. Our work thus far reveals that the neural encoding process is reasonably well explained by a largely feedforward, highly complex, multistaged nonlinear neural network: the current best neuronal simulation models predict approximately one-half of the relevant neuronal response variance across the highest levels of the ventral stream (areas V4 and IT). Remarkably, however, the decoding process from IT to behavior for all object recognition tasks tested thus far is very accurately predicted by simple, direct linear conversion of the inferior temporal neural population state to behavioral choice. We have recently examined the behavioral consequences of direct suppression of IT neural activity using pharmacological and optogenetic methods and find them to be well explained by the same linear decoding model.

}, issn = {0091-7451}, doi = {10.1101/sqb.2014.79.024729}, url = {http://symposium.cshlp.org/content/79/99.full.pdf+html}, author = {Afraz, Arash and Yamins, Daniel L.K. and DiCarlo, James J.} } @article {31, title = {Performance-optimized hierarchical models predict neural responses in higher visual cortex}, journal = {Proceedings of the National Academy of Sciences}, year = {2014}, month = {10/2014}, pages = {8619 - 8624}, abstract = {

The ventral visual stream underlies key human visual object recognition abilities. However, neural encoding in the higher areas of the ventral stream remains poorly understood. Here, we describe a modeling approach that yields a quantitatively accurate model of inferior temporal (IT) cortex, the highest ventral cortical area. Using high-throughput computational techniques, we discovered that, within a class of biologically plausible hierarchical neural network models, there is a strong correlation between a model's categorization performance and its ability to predict individual IT neural unit response data. To pursue this idea, we then identified a high-performing neural network that matches human performance on a range of recognition tasks. Critically, even though we did not constrain this model to match neural data, its top output layer turns out to be highly predictive of IT spiking responses to complex naturalistic images at both the single site and population levels. Moreover, the model's intermediate layers are highly predictive of neural responses in the V4 cortex, a midlevel visual area that provides the dominant cortical input to IT. These results show that performance optimization, applied in a biologically appropriate model class, can be used to build quantitative predictive models of neural processing.

}, issn = {0027-8424}, doi = {10.1073/pnas.1403112111}, url = {http://www.pnas.org/cgi/doi/10.1073/pnas.1403112111}, author = {Yamins, D. L. K. and Hong, H. and Cadieu, C. F. and Solomon, E. A. and Seibert, D. and DiCarlo, J. J.} } @conference {37, title = {Hierarchical Modular Optimization of Convolutional Networks Achieves Representations Similar to Macaque IT and Human Ventral Stream}, booktitle = {Advances in Neural Information Processing Systems}, year = {2013}, month = {12/2013}, address = {Lake Tahoe, Nevada, United States.}, abstract = {

Humans recognize visually-presented objects rapidly and accurately. To understand this ability, we seek to construct models of the ventral stream, the series of cortical areas thought to subserve object recognition. One tool to assess the quality of a model of the ventral stream is the Representational Dissimilarity Matrix (RDM), which uses a set of visual stimuli and measures the distances produced in either the brain (i.e. fMRI voxel responses, neural firing rates) or in models (features). Previous work has shown that all known models of the ventral stream fail to capture the RDM pattern observed in either IT cortex, the highest ventral area, or in the human ventral stream. In this work, we construct models of the ventral stream using a novel optimization procedure for category-level object recognition problems, and produce RDMs resembling both macaque IT and human ventral stream. The model, while novel in the optimization procedure, further develops a long-standing functional hypothesis that the ventral visual stream is a hierarchically arranged series of processing stages optimized for visual object recognition.
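A minimal sketch of the RDM machinery this abstract names: compute 1 minus the Pearson correlation between response patterns for each image pair, then compare brain and model RDMs. The arrays are random stand-ins; published analyses add noise ceilings and bootstrap tests.

```python
# Sketch: Representational Dissimilarity Matrix (RDM) comparison.
import numpy as np
from scipy.stats import spearmanr

def rdm(responses):
    """responses: (n_images, n_units) -> (n_images, n_images) dissimilarity."""
    return 1.0 - np.corrcoef(responses)

rng = np.random.default_rng(3)
brain = rng.normal(size=(50, 100))       # e.g., IT sites (hypothetical)
model = rng.normal(size=(50, 4096))      # e.g., one model layer (hypothetical)

iu = np.triu_indices(50, k=1)            # unique off-diagonal image pairs
rho, _ = spearmanr(rdm(brain)[iu], rdm(model)[iu])
print(f"RDM similarity (Spearman rho): {rho:.2f}")
```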

}, doi = {https://papers.nips.cc/paper/4991-hierarchical-modular-optimization-of-convolutional-networks-achieves-representations-similar-to-macaque-it-and-human-ventral-stream}, url = {https://papers.nips.cc/paper/4991-hierarchical-modular-optimization-of-convolutional-networks-achieves-representations-similar-to-macaque-it-and-human-ventral-stream.pdf}, author = {Daniel L. K. Yamins and Ha Hong and Cadieu, Charles F and DiCarlo, James J.} } @article {34, title = {Large-Scale, High-Resolution Neurophysiological Maps Underlying fMRI of Macaque Temporal Lobe}, journal = {Journal of Neuroscience}, volume = {33}, year = {2013}, month = {2013}, pages = {15207 - 15219}, abstract = {

Maps obtained by functional magnetic resonance imaging (fMRI) are thought to reflect the underlying spatial layout of neural activity. However, previous studies have not been able to directly compare fMRI maps to high-resolution neurophysiological maps, particularly in higher level visual areas. Here, we used a novel stereo microfocal x-ray system to localize thousands of neural recordings across monkey inferior temporal cortex (IT), construct large-scale maps of neuronal object selectivity at subvoxel resolution, and compare those neurophysiology maps with fMRI maps from the same subjects. While neurophysiology maps contained reliable structure at the sub-millimeter scale, fMRI maps of object selectivity contained information at larger scales (>2.5 mm) and were only partly correlated with raw neurophysiology maps collected in the same subjects. However, spatial smoothing of neurophysiology maps more than doubled that correlation, while a variety of alternative transforms led to no significant improvement. Furthermore, raw spiking signals, once spatially smoothed, were as predictive of fMRI maps as local field potential signals. Thus, fMRI of the inferior temporal lobe reflects a spatially low-passed version of neurophysiology signals. These findings strongly validate the widespread use of fMRI for detecting large (>2.5 mm) neuronal domains of object selectivity but show that a complete understanding of even the most pure domains (e.g., faces vs nonface objects) requires investigation at fine scales that can currently only be obtained with invasive neurophysiological methods.
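A toy sketch of the smoothing logic: correlate a high-resolution "physiology" map with a coarse "fMRI-like" map before and after Gaussian blurring. The maps are synthetic and the sigma is arbitrary; the paper's registration, voxel geometry, and statistics are not reproduced.

```python
# Sketch: spatial smoothing raises physiology-to-fMRI map correlation.
import numpy as np
from scipy.ndimage import gaussian_filter

rng = np.random.default_rng(4)
coarse = gaussian_filter(rng.normal(size=(64, 64)), sigma=6)  # "fMRI-like" map
fine = coarse + rng.normal(scale=1.0, size=(64, 64))          # adds fine detail

def corr(a, b):
    return np.corrcoef(a.ravel(), b.ravel())[0, 1]

print(f"raw physiology vs fMRI:      r = {corr(fine, coarse):.2f}")
smoothed = gaussian_filter(fine, sigma=6)                     # mm-scale blur
print(f"smoothed physiology vs fMRI: r = {corr(smoothed, coarse):.2f}")
```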

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.1248-13.2013}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.1248-13.2013}, author = {Issa, E. B. and Papanastassiou, A. M. and DiCarlo, J. J.} } @article {36, title = {The Neural Representation Benchmark and its Evaluation on Brain and Machine}, journal = {arXiv}, year = {2013}, month = {01/2013}, type = {preprint}, abstract = {

A key requirement for the development of effective learning representations is their evaluation and comparison to representations we know to be effective. In natural sensory domains, the community has viewed the brain as a source of inspiration and as an implicit benchmark for success. However, it has not previously been possible to test representational learning algorithms directly against the representations contained in neural systems. Here, we propose a new benchmark for visual representations on which we have directly tested the neural representation in multiple visual cortical areas in macaque (utilizing data from [Majaj et al., 2012]), and on which any computer vision algorithm that produces a feature space can be tested. The benchmark measures the effectiveness of the neural or machine representation by computing the classification loss on the ordered eigendecomposition of a kernel matrix [Montavon et al., 2011]. In our analysis we find that the neural representation in visual area IT is superior to that in visual area V4. In our analysis of representational learning algorithms, we find that three-layer models approach the representational performance of V4 and that the algorithm in [Le et al., 2012] surpasses the performance of V4. Impressively, we find that a recent supervised algorithm [Krizhevsky et al., 2012] achieves performance comparable to that of IT for an intermediate level of image variation difficulty, and surpasses IT at a higher difficulty level. We believe this result represents a major milestone: it is the first learning algorithm we have found that exceeds our current estimate of IT representation performance. We hope that this benchmark will assist the community in matching the representational performance of visual cortex and will serve as an initial rallying point for further correspondence between representations derived in brains and machines.

}, doi = {https://arxiv.org/abs/1301.3530}, url = {https://arxiv.org/pdf/1301.3530.pdf}, author = {Cadieu, Charles F. and Ha Hong and Daniel L. K. Yamins and Pinto, Nicolas and Majaj, Najib J and James J. DiCarlo} } @article {35, title = {Shape Similarity, Better than Semantic Membership, Accounts for the Structure of Visual Object Representations in a Population of Monkey Inferotemporal Neurons}, journal = {PLoS Computational Biology}, volume = {9}, year = {2013}, month = {08/2013}, pages = {e1003167}, abstract = {

The anterior inferotemporal cortex (IT) is the highest stage along the hierarchy of visual areas that, in primates, processes visual objects. Although several lines of evidence suggest that IT primarily represents visual shape information, some recent studies have argued that neuronal ensembles in IT code the semantic membership of visual objects (i.e., represent conceptual classes such as animate and inanimate objects). In this study, we investigated to what extent semantic, rather than purely visual, information is represented in IT by performing a multivariate analysis of IT responses to a set of visual objects. By relying on a variety of machine-learning approaches (including a cutting-edge clustering algorithm recently developed in the domain of statistical physics), we found that, in most instances, IT representation of visual objects is accounted for by their similarity at the level of shape or, more surprisingly, low-level visual properties. Only in a few cases did we observe IT representations of semantic classes that were not explainable by the visual similarity of their members. Overall, these findings reassert the primary function of IT as a conveyor of explicit visual shape information, and reveal that low-level visual properties are represented in IT to a greater extent than previously appreciated. In addition, our work demonstrates how combining a variety of state-of-the-art multivariate approaches, and carefully estimating the contribution of shape similarity to the representation of object categories, can substantially advance our understanding of neuronal coding of visual objects in cortex.

}, doi = {10.1371/journal.pcbi.1003167}, url = {https://dx.plos.org/10.1371/journal.pcbi.1003167}, author = {Baldassi, Carlo and Alemi-Neissi, Alireza and Pagan, Marino and DiCarlo, James J. and Zecchina, Riccardo and Zoccolan, Davide} } @article {40, title = {Balanced Increases in Selectivity and Tolerance Produce Constant Sparseness along the Ventral Visual Stream}, journal = {Journal of Neuroscience}, volume = {32}, year = {2012}, month = {07/2012}, pages = {10170 - 10182}, abstract = {

Although popular accounts suggest that neurons along the ventral visual processing stream become increasingly selective for particular objects, this appears at odds with the fact that inferior temporal cortical (IT) neurons are broadly tuned. To explore this apparent contradiction, we compared processing in two ventral stream stages (visual cortical areas V4 and IT) in the rhesus macaque monkey. We confirmed that IT neurons are indeed more selective for conjunctions of visual features than V4 neurons and that this increase in feature conjunction selectivity is accompanied by an increase in tolerance ("invariance") to identity-preserving transformations (e.g., shifting, scaling) of those features. We report here that V4 and IT neurons are, on average, tightly matched in their tuning breadth for natural images ("sparseness") and that the average V4 or IT neuron will produce a robust firing rate response (>50% of its peak observed firing rate) to ~10% of all natural images. We also observed that sparseness was positively correlated with conjunction selectivity and negatively correlated with tolerance within both V4 and IT, consistent with selectivity-building and invariance-building computations that offset one another to produce sparseness. Our results imply that the conjunction-selectivity-building and invariance-building computations necessary to support object recognition are implemented in a balanced manner to maintain sparseness at each stage of processing.
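For readers unfamiliar with sparseness measures, here is one standard index (Vinje & Gallant, 2000); whether this exact index matches the paper's measure is an assumption, but it illustrates the quantity being compared between V4 and IT.

```python
# Sketch: S = (1 - mean(r)^2 / mean(r^2)) / (1 - 1/n),
# ranging from 0 (dense, broadly tuned) to 1 (maximally sparse).
import numpy as np

def sparseness(rates):
    rates = np.asarray(rates, dtype=float)
    n = rates.size
    a = rates.mean() ** 2 / (rates ** 2).mean()      # "activity fraction"
    return (1.0 - a) / (1.0 - 1.0 / n)

rng = np.random.default_rng(5)
broad = rng.uniform(5, 10, 300)                      # responds to most images
sparse = np.where(rng.random(300) < 0.1, 20.0, 0.1)  # ~10% of images drive it
print(f"broadly tuned unit: S = {sparseness(broad):.2f}")
print(f"sparse unit:        S = {sparseness(sparse):.2f}")
```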

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.6125-11.2012}, url = {http://www.jneurosci.org/content/32/30/10170.full.pdf+html}, author = {Rust, N. C. and DiCarlo, J. J.} } @article {41, title = {How Does the Brain Solve Visual Object Recognition?}, journal = {Neuron}, volume = {73}, year = {2012}, month = {01/2012}, pages = {415 - 434}, abstract = {

Mounting evidence suggests that 'core object recognition,' the ability to rapidly recognize objects despite substantial appearance variation, is solved in the brain via a cascade of reflexive, largely feedforward computations that culminate in a powerful neuronal representation in the inferior temporal cortex. However, the algorithm that produces this solution remains poorly understood. Here we review evidence ranging from individual neurons and neuronal populations to behavior and computational models. We propose that understanding this algorithm will require using neuronal and psychophysical data to sift through many computational models, each based on building blocks of small, canonical subnetworks with a common functional goal.

}, issn = {0896-6273}, doi = {10.1016/j.neuron.2012.01.010}, url = {https://www.sciencedirect.com/science/article/pii/S089662731200092X}, author = {DiCarlo, James J. and Zoccolan, Davide and Rust, Nicole C.} } @article {39, title = {Neuronal Learning of Invariant Object Representation in the Ventral Visual Stream Is Not Dependent on Reward}, journal = {Journal of Neuroscience}, volume = {32}, year = {2012}, month = {09/2012}, pages = {6611 - 6620}, abstract = {

Neurons at the top of the primate ventral visual stream [inferior temporal cortex (IT)] have selectivity for objects that is highly tolerant to variation in the object's appearance on the retina. Previous nonhuman primate (Macaca mulatta) studies suggest that this neuronal tolerance is at least partly supported by the natural temporal contiguity of visual experience, because altering that temporal contiguity can robustly alter adult IT position and size tolerance. According to that work, it is the statistics of the subject's visual experience, not the subject's reward, that instruct the specific images that IT treats as equivalent. But is reward necessary for gating this type of learning in the ventral stream? Here we show that this is not the case: temporal tolerance learning proceeds at the same rate, regardless of reward magnitude and regardless of the temporal co-occurrence of reward, even in a behavioral task that does not require the subject to engage the object images. This suggests that the ventral visual stream uses autonomous, fully unsupervised mechanisms to constantly leverage all visual experience to help build its invariant object representation.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.3786-11.2012}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.3786-11.2012}, author = {Li, N. and DiCarlo, J. J.} } @article {38, title = {Precedence of the Eye Region in Neural Processing of Faces}, journal = {Journal of Neuroscience}, volume = {32}, year = {2012}, month = {09/2013}, pages = {16666 - 16682}, abstract = {

Functional magnetic resonance imaging (fMRI) has revealed multiple subregions in monkey inferior temporal cortex (IT) that are selective for images of faces over other objects. The earliest of these subregions, the posterior lateral face patch (PL), has not been studied previously at the neurophysiological level. Perhaps not surprisingly, we found that PL contains a high concentration of "face-selective" cells when tested with standard image sets comparable to those used previously to define the region at the level of fMRI. However, we here report that several different image sets and analytical approaches converge to show that nearly all face-selective PL cells are driven by the presence of a single eye in the context of a face outline. Most strikingly, images containing only an eye, even when incorrectly positioned in an outline, drove neurons nearly as well as full-face images, and face images lacking only this feature led to longer latency responses. Thus, bottom-up face processing is relatively local and linearly integrates features, consistent with parts-based models, grounding investigation of how the presence of a face is first inferred in the IT face processing hierarchy.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.2391-12.2012}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.2391-12.2012}, author = {Issa, E. B. and DiCarlo, J. J.} } @conference {42, title = {A unified neuronal population code fully explains human object recognition.}, booktitle = {Computation and Systems Neuroscience (COSYNE)}, year = {2012}, month = {02/2012}, address = {Salt Lake City, Utah, USA}, abstract = {

Our goal is to understand the neuronal mechanisms that underlie human visual object recognition (OR). While previous work has argued for qualitative links between neuronal responses in the ventral visual stream and human shape judgements, no study has asked which, if any, neuronal responses are quantitatively sufficient to explain broad-domain human OR performance. The shift from qualitative to quantitative hypotheses requires a framework to link neuronal responses to behavior ("unified code"). Here we ask: is there a common neuronal basis (e.g., in IT cortex) and a simple (e.g., linear) transformation that will predict all of human OR performance? We first defined OR operationally by obtaining human psychophysical measurements using images that explore shape similarity and identity-preserving image variation, resulting in OR benchmarks that span a range of difficulty. Using the same visual images, we measured neuronal responses in V4 and IT in two monkeys. We implemented 14 unified codes based on those neuronal data and computed cross-validated neuronal discriminability indices (d') to compare to the human d' values. The dynamic range across those d' values sets a high bar for when a putative code is sufficient to explain behavior: it is not sufficient for a code to perform well (high d') or to match one d'. Instead, a sufficient unified code must also emergently predict the entire pattern of behavior over all tasks. Remarkably, we found a few unified IT-based codes that meet this high bar. Interestingly, many other IT codes and all V4 codes are insufficient. While humans outperform computer vision systems on many of our OR tasks, their abilities reliably depend on the images tested. These dependencies in human performance are fully explained by a simple, unified reading of monkey ventral stream neurons, a feat unmatched by any computer vision system we tested.
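A simplified sketch of a cross-validated neuronal d': decode a binary task from population responses on held-out trials, then convert hit and false-alarm rates to d'. All data are synthetic and the conversion is a common textbook one; the abstract's 14 codes differ in which signals feed such a decoder.

```python
# Sketch: cross-validated discriminability index (d') from a decoder.
import numpy as np
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(6)
X = rng.normal(size=(400, 120))           # trials x neural sites (hypothetical)
y = np.repeat([0, 1], 200)
X[y == 1, :15] += 0.4                     # hypothetical task signal

pred = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=10)
hit = np.mean(pred[y == 1] == 1)
fa = np.mean(pred[y == 0] == 1)
eps = 1e-3                                 # avoid infinite z-scores at 0 or 1
dprime = norm.ppf(np.clip(hit, eps, 1 - eps)) - norm.ppf(np.clip(fa, eps, 1 - eps))
print(f"cross-validated d' = {dprime:.2f}")
```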

}, doi = {http://www.cosyne.org/c/index.php?title=Cosyne_12}, url = {http://cosyne.org/cosyne12/Cosyne2012_program_book.pdf}, author = {Majaj, Najib J and Hong, Ha and Solomon, EA and DiCarlo, James J.} } @proceedings {43, title = {Comparing State-of-the-Art Visual Features on Invariant Object Recognition Tasks}, journal = {IEEE Workshop on Applications of Computer Vision (WACV)}, year = {2011}, month = {01/2011}, pages = {463-470}, publisher = {IEEE}, address = {Kona, Hawaii, USA}, abstract = {

Tolerance ("invariance") to identity-preserving image variation (e.g., variation in position, scale, pose, illumination) is a fundamental problem that any visual object recognition system, biological or engineered, must solve. While standard natural image database benchmarks are useful for guiding progress in computer vision, they can fail to probe the ability of a recognition system to solve the invariance problem. Thus, to understand which computational approaches are making progress on solving the invariance problem, we compared and contrasted a variety of state-of-the-art visual representations using synthetic recognition tasks designed to systematically probe invariance. We successfully re-implemented a variety of state-of-the-art visual representations and confirmed their published performance on a natural image benchmark. We here report that most of these representations perform poorly on invariant recognition, but that one representation shows significant performance gains over two baseline representations. We also show how this approach can more deeply illuminate the strengths and weaknesses of different visual representations and thus guide progress on invariant object recognition.

}, isbn = {978-1-4244-9497-2}, doi = {10.1109/WACV.2011.5711540}, url = {https://ieeexplore.ieee.org/document/5711540}, author = {Pinto, Nicolas and Barhomi, Y and Cox, David D. and DiCarlo, James J.} } @proceedings {44, title = {From luminance to semantics: how natural objects are represented in monkey inferotemporal cortex}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2011}, month = {02/2011}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, abstract = {

In primates, visual object information is processed through a hierarchy of cortico-cortical stages that culminates with the inferotemporal cortex (IT). Although the nature of visual processing in IT is still poorly understood, several lines of evidence suggest that IT conveys an explicit object representation that can directly serve as a basis for decision, action, and memory; e.g., it can support flexible formation of semantic categories in downstream areas, such as prefrontal and perirhinal cortex. However, some recent studies (Kiani et al, 2007; Kriegeskorte et al, 2008) have argued that IT neuronal ensembles may themselves code the semantic membership of visual objects (i.e., represent abstract conceptual classes such as animate and inanimate objects, animals, etc). In this study, we applied an array of multivariate computational approaches to investigate the nature of visual objects' representation in IT. Our results show that IT neuronal ensembles represent a surprisingly broad spectrum of visual feature complexity, ranging from low-level visual properties (e.g., brightness), to visual patterns of intermediate complexity (e.g., star-like shapes), to complex objects (e.g., four-legged animals) that appear to be coded so invariantly that their clustering in the IT neuronal space is not easily accounted for by any similarity metric we used. On the one hand, these findings show that IT supports recognition of low-level properties of the visual input that are typically thought to be extracted by lower-level visual areas. On the other hand, IT appears to convey such an explicit representation of some object classes that coding of semantic membership in IT (at least for a few categories) cannot be excluded. Overall, these results shed new light on IT's remarkable pluripotency in supporting recognition tasks as diverse as detection of brightness and categorization of complex shapes.

}, url = {http://www.cosyne.org/c/index.php?title=Cosyne_11_posters/I-89}, author = {Pagan, Marino and Neissi, AA and Baldassi, Carlo and Zecchina, Riccardo and DiCarlo, James J. and Zoccolan, Davide} } @inbook {49, title = {Do we have a strategy for understanding how the visual system accomplishes object recognition?}, booktitle = {Object Categorization: Computer and Human Vision Perspectives}, year = {2010}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {New York, NY, USA}, issn = {978-0-521-88738-0}, author = {DiCarlo, James J.}, editor = {Dickenson, SJ and Leonardis, A. and Schiele, B. and Tarr, MJ.} } @proceedings {50, title = {Does the visual system use natural experience to construct size invariant object representations?}, journal = {Computation and Systems Neuroscience (COSYNE)}, year = {2010}, month = {02/2010}, address = {Salt Lake City, Utah, USA}, abstract = {

Object recognition is challenging because each object produces myriad retinal images. Responses of neurons at the top of the ventral visual stream (inferior temporal cortex, IT) exhibit object selectivity that is unaffected by these image changes. How do IT neurons attain this tolerance ("invariance")? One powerful idea is that temporal contiguity of natural visual experience can instruct tolerance (e.g., Foldiak, Neural Computation, 1991): because objects remain present for many seconds, whereas object or viewer motion causes changes in each object's retinal image over shorter time intervals, the ventral stream could construct tolerance by learning to associate neuronal representations that occur closely in time. We recently found a neuronal signature of such learning in IT: temporally contiguous experience with different object images at different retinal positions can robustly reshape ("break") IT position tolerance, producing a tendency for IT neurons to confuse the identities of those temporally coupled objects across their manipulated positions (Li & DiCarlo, Science, 2008). A similar manipulation can induce the same pattern of confusion in the position tolerance of human object perception (Cox, Meier, Oertelt, DiCarlo, Nat Neurosci, 2005). Does this IT neuronal learning reflect a canonical unsupervised learning algorithm that the ventral stream relies on to achieve tolerance to all types of image variation (e.g., object size and pose changes)? To begin to answer this question, we here extend our position tolerance paradigm to object size changes. Non-human primates were exposed, without supervision, to an altered visual world in which we temporally coupled the experience of two object images of different sizes at each animal's center of gaze: e.g., a small image of one object (P, the neuronally preferred object) was consistently followed by a large image of a second object (N), rendering the small image of P temporally contiguous with the large image of N. We made IT neuronal selectivity measurements before and after the animals received ~2 hours of experience in the unsupervised, altered visual world. Consistent with our results on position tolerance, we found that this size experience manipulation robustly reshapes IT size tolerance over a period of hours. Specifically, unlike experienced controls, we found a change in neuronal selectivity (P-N) across the manipulated objects and their manipulated sizes, producing a tendency to confuse those object identities across those sizes. This change in size tolerance was specific to the manipulated objects, grew gradually stronger with increasing experience, and proceeded at a rate similar to position tolerance learning (~5 spikes/s per hour of exposure). Finally, in a separate experiment, we examined how the temporal direction of the experience affects the learning: do temporally early images teach temporally later ones, or vice versa? We found greater learning for the temporally later images, suggesting a Hebbian-like learning mechanism (e.g., Sprekeler & Gerstner, COSYNE, 2009; Wallis & Rolls, Prog Neurobiol, 1997). We speculate that these converging results on IT position and size tolerance plasticity reflect an underlying unsupervised cortical learning mechanism by which the ventral visual stream acquires and maintains its tolerant object representations.

}, doi = {10.3389/conf.fnins.2010.03.00326}, url = {https://www.frontiersin.org/Community/AbstractDetails.aspx?ABS_DOI=10.3389/conf.fnins.2010.03.00326\&eid=770\&sname=Computational_and_Systems_Neuroscience_2010}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {52, title = {A High-Throughput Screening Approach to Biologically-Inspired Object Recognition}, journal = {Learning Workshop-Computation and Systems Neuroscience (COSYNE)}, year = {2010}, month = {03/2010}, address = {Snow Bird, Utah, USA}, author = {Pinto, Nicolas and DiCarlo, James J. and Cox, David D.} } @proceedings {51, title = {Human versus machine: comparing visual object recognition systems on a level playing field}, journal = {Computation and Systems Neuroscience (COSYNE)}, year = {2010}, month = {02/2010}, address = {Salt Lake City, Utah, USA}, abstract = {

It is received wisdom that biological visual systems easily outmatch current artificial systems at complex visual tasks like object recognition. But have the appropriate comparisons been made? Because artificial systems are improving every day, they may surpass human performance some day. We must understand our progress toward reaching that day, because that success is one of several necessary requirements for "understanding" visual object recognition. How large (or small) is the difference in performance between current state-of-the-art object recognition systems and the primate visual system? In practice, the performance comparison of any two object recognition systems requires a focus on the computational crux of the problem and sets of images that engage it. Although it is widely believed that tolerance ("invariance") to identity-preserving image variation (e.g., variation in object position, scale, pose, illumination) is critical, systematic comparisons of state-of-the-art artificial visual representations almost always rely on "natural" image databases that can fail to probe the ability of a recognition system to solve the invariance problem [Pinto et al PLoS08, COSYNE08, ECCV08, CVPR09]. Thus, to understand how well current state-of-the-art visual representations perform relative to each other, relative to low-level neuronal representations (e.g., retinal-like and V1-like), and relative to high-level representations (e.g., human performance), we tested all of these representations on a common set of visual object recognition tasks that directly engage the invariance problem. Specifically, we used a synthetic testing approach that allows direct engagement of the invariance problem, as well as knowledge and control of all the key parameters that make object recognition challenging. We successfully re-implemented a variety of state-of-the-art visual representations, and we confirmed the high published performance of all of these state-of-the-art representations on large, complex "natural" image benchmarks. Surprisingly, we found that most of these representations were weak on our simple synthetic tests of invariant recognition, and only high-level biologically-inspired representations showed performance gains above the neuroscience "null" representation (V1-like). While, in aggregate, we found that the performance of these state-of-the-art representations pales in comparison to human performance, humans and computers seem to fail in different and potentially enlightening ways when faced with the problem of invariance. We also show how our synthetic testing approach can more deeply illuminate the strengths and weaknesses of different visual representations and thus guide progress on invariant object recognition.

}, doi = {10.3389/conf.fnins.2010.03.00283}, url = {https://www.frontiersin.org/Community/AbstractDetails.aspx?ABS_DOI=10.3389/conf.fnins.2010.03.00283\&eid=770\&sname=Computational_and_Systems_Neuroscience_2010}, author = {Pinto, Nicolas and Majaj, Najib J and Barhomi, Y and Solomon, E. A. and Cox, David D. and DiCarlo, James J.} } @article {45, title = {Selectivity and Tolerance ("Invariance") Both Increase as Visual Information Propagates from Cortical Area V4 to IT}, journal = {Journal of Neuroscience}, volume = {30}, year = {2010}, month = {09/2010}, pages = {12978 - 12995}, abstract = {

Our ability to recognize objects despite large changes in position, size, and context is achieved through computations that are thought to increase both the shape selectivity and the tolerance ("invariance") of the visual representation at successive stages of the ventral pathway [visual cortical areas V1, V2, and V4 and inferior temporal cortex (IT)]. However, these ideas have proven difficult to test. Here, we consider how well population activity patterns at two stages of the ventral stream (V4 and IT) discriminate between, and generalize across, different images. We found that both V4 and IT encode natural images with similar fidelity, whereas the IT population is much more sensitive to controlled, statistical scrambling of those images. Scrambling sensitivity was proportional to receptive field (RF) size in both V4 and IT, suggesting that, on average, the number of visual feature conjunctions implemented by a V4 or IT neuron is directly related to its RF size. We also found that the IT population could better discriminate between objects across changes in position, scale, and context, thus directly demonstrating a V4-to-IT gain in tolerance. This tolerance gain could be accounted for by both a decrease in single-unit sensitivity to identity-preserving transformations (e.g., an increase in RF size) and an increase in the maintenance of rank-order object selectivity within the RF. These results demonstrate that, as visual information travels from V4 to IT, the population representation is reformatted to become more selective for feature conjunctions and more tolerant to identity-preserving transformations, and they reveal the single-unit response properties that underlie that reformatting.
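A toy sketch of the discriminate-vs-generalize logic: train a linear decoder on population responses at one object position and test it both on new trials at that position and at a shifted position. The response model (identity pattern plus a uniform position offset) is a deliberate simplification and not the paper's data.

```python
# Sketch: tolerance test via cross-position decoder generalization.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(7)
n_trials, n_units = 100, 80
identity_signal = rng.normal(size=(2, n_units))   # hypothetical object patterns

def responses(position_offset):
    """Simulated population responses to objects 0/1 at one position."""
    X = np.vstack([identity_signal[0] + rng.normal(0, 1, (n_trials, n_units)),
                   identity_signal[1] + rng.normal(0, 1, (n_trials, n_units))])
    return X + position_offset, np.repeat([0, 1], n_trials)

X_train, y_train = responses(0.0)
X_same, y_same = responses(0.0)       # new trials, same position
X_shift, y_shift = responses(0.5)     # new trials, shifted position

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(f"discriminate (same position): {clf.score(X_same, y_same):.2f}")
print(f"generalize (new position):    {clf.score(X_shift, y_shift):.2f}")
```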

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.0179-10.2010}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.0179-10.2010}, author = {Rust, N. C. and DiCarlo, J. J.} } @proceedings {48, title = {Towards large-scale, high resolution maps of object selectivity in inferior temporal cortex}, journal = {Computation and Systems Neuroscience (COSYNE)}, year = {2010}, month = {02/2010}, address = {Salt Lake City, Utah, USA}, abstract = {

Inferior temporal cortex (IT) has been shown to have large-scale (mm to cm) maps of object category selectivity as well as small-scale (sub-millimeter) organization for object features. These two scales of spatial organization have yet to be linked because they were measured using different techniques (fMRI, optical imaging, and local electrophysiology), each with its own limitations. For example, fMRI has poor spatial resolution, while optical imaging has higher resolution at the expense of a narrow field of view of only surface-accessible cortex. Given that much of IT lies inside a major cortical sulcus or at the skull base, what is needed is a method that can access the whole of IT with high resolution. Microelectrode-based mapping has such potential: electrodes can reach almost anywhere in IT (high spatial coverage) and record from single cells (high spatial resolution). This potential has not yet been realized because of the difficulty of precisely localizing and co-registering many electrode recordings in vivo. Methods such as histological reconstruction of lesion sites or MRI visualization of electrodes are post hoc and can introduce additional spatial errors. Here, we have adopted a microfocal stereo x-ray system for localizing electrodes that can be used at an unlimited number of sites and operates virtually in real time (Cox et al., J. Neurophys. 2008). We have used this system to construct broad-scale maps of object category selectivity in IT for comparison to fMRI-based maps. We found a weak but significant correspondence between physiology and fMRI maps collected in the same animal, and this correspondence improved substantially when MUA and LFP signals were smoothed (~3-5 mm) to broader scales, consistent with the spatially low-pass nature of fMRI. Transformations other than spatial smoothing, such as dividing the LFP into power in different frequency bands, did not produce noticeable improvement in map correspondence.

}, doi = {10.3389/conf.fnins.2010.03.00154}, url = {https://www.frontiersin.org/Community/AbstractDetails.aspx?ABS_DOI=10.3389/conf.fnins.2010.03.00154\&eid=770\&sname=computational_and_systems_neur}, author = {Issa, Elias B. and Papanastassiou, A. M. and Andken, BB and DiCarlo, James J.} } @article {46, title = {Unsupervised Natural Visual Experience Rapidly Reshapes Size-Invariant Object Representation in Inferior Temporal Cortex}, journal = {Neuron}, volume = {67}, year = {2010}, month = {01/2010}, pages = {1062 - 1075}, abstract = {

We easily recognize objects and faces across a myriad of retinal images produced by each object. One hypothesis is that this tolerance (a.k.a. "invariance") is learned by relying on the fact that object identities are temporally stable. While we previously found neuronal evidence supporting this idea at the top of the nonhuman primate ventral visual stream (inferior temporal cortex, or IT), we here test whether this is a general tolerance learning mechanism. First, we found that the same type of unsupervised experience that reshaped IT position tolerance also predictably reshaped IT size tolerance, and the magnitude of reshaping was quantitatively similar. Second, this tolerance reshaping can be induced under naturally occurring dynamic visual experience, even without eye movements. Third, unsupervised temporally contiguous experience can build new neuronal tolerance. These results suggest that the ventral visual stream uses a general unsupervised tolerance learning algorithm to build its invariant object representation.

}, issn = {08966273}, doi = {10.1016/j.neuron.2010.08.029}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627310006392}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {47, title = {What is the middle face patch?}, journal = {Society for Neuroscience}, volume = {40}, year = {2010}, month = {2010}, pages = {581.8}, publisher = {SFN }, address = {San Diego, CA, USA}, abstract = {

Functional MRI in the monkey has revealed at least six patches of tissue in the ventral visual cortical stream that prefer images of faces over non-face objects. Neuronal recordings targeted to the largest patch ("middle face patch"), hypothesized to be homologous to the human FFA, revealed that nearly all visually driven cells were at least 2x more responsive to faces than to non-face images. These results support a view in which this region consists entirely of "face cells" involved exclusively in face processing. However, previous work could not distinguish between a single large cluster, several clusters, or a local, graded enrichment of "face cells". More deeply, while "face cells" are typically defined as those with a preference for face stimuli on average, it is not clear what model best explains their response properties. A strong form of the "face cell" hypothesis is a semantic response model in which each cell responds only to images that an observer would distinguish as a face. An alternative class of models places these cells at an intermediate processing stage along the ventral visual feature hierarchy, in which some face selectivity has been achieved but is the result of neuronal tuning for complex visual features found in both face and some non-face objects. To explore these questions, we used a custom-built high-resolution x-ray imaging system to map the spiking selectivity at hundreds of sites in and around the fMRI-identified macaque middle face patch using a large class of face and non-face stimuli. Our current data show a single region of face selectivity, approximately 2-3 mm in diameter, in rough correspondence with the fMRI-identified face patch. We confirmed previous results in that ~90% of neuronal sites in this region are "face selective" when evaluated with simple face vs. object response criteria. However, we found that none of these sites maintained a strict semantic designation by responding significantly more to all face images than to any non-face image. Moreover, at least ~60% of sites show a significantly greater response to at least one non-face image, such as images of fruits, cars, and toys, over at least one face image, such as caricatures of human and monkey faces. These results argue against a strong semantic response model of the middle "face patch." Instead, the most parsimonious interpretation of these data is that neurons in the fMRI-identified middle "face patch" are located at an intermediate stage of visual processing: although they may be tapped by higher areas to build a single-neuron explicit, semantic-level representation of faces, their role may be to represent intermediate visual features that are also found in non-face objects.
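A toy sketch contrasting the two criteria this abstract discusses: an average-response face-selectivity criterion versus the strict semantic ordering (every face image beats every non-face image). The firing rates are hypothetical, and the image-by-image statistics in the abstract go beyond these summary checks.

```python
# Sketch: average face selectivity vs. strict semantic ordering.
import numpy as np

rng = np.random.default_rng(10)
face_rates = rng.normal(30, 5, 40)        # responses to 40 face images
nonface_rates = rng.normal(12, 5, 40)     # responses to 40 non-face images

fsi = (face_rates.mean() - nonface_rates.mean()) / (
       face_rates.mean() + nonface_rates.mean())
twice_as_strong = face_rates.mean() >= 2 * nonface_rates.mean()
print(f"face-selectivity index: {fsi:.2f}; >=2x criterion: {twice_as_strong}")

# The strict semantic model asks a stronger question: does every face
# image drive the site more than every non-face image?
strict_semantic = face_rates.min() > nonface_rates.max()
print(f"strict semantic ordering: {strict_semantic}")
```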

}, url = {https://www.abstractsonline.com/Plan/ViewAbstract.aspx?sKey=e08f5ff4-1ba9-4faf-a459-5c9d4be0a1bf\&cKey=36fa0d7d-3e83-4910-be75-57d361ae9e58\&mKey=\%7bE5D5C83F-CE2D-4D71-9DD6-FC7231E090FB\%7d}, author = {Aparicio, Paul L. and Issa, Elias B. and DiCarlo, James J.} } @article {55, title = {Balanced increases in selectivity and invariance produce constant sparseness across the ventral visual pathway}, journal = {Journal of Vision}, volume = {9}, year = {2009}, month = {Jan-08-2009}, pages = {738 - 738}, abstract = {

While several studies report neurons in inferotemporal cortex (IT) that are highly selective for particular objects or images, other studies report that neurons in IT tend to be broadly tuned. To investigate how selectivity changes across the ventral visual pathway, we compared the responses of neurons in a mid-level visual area (V4) and a high-level visual area (IT). We first assessed the selectivity of neurons in each area by determining how well each population could discriminate between natural images and "scrambled" versions of those images that have the same low-level structure but configured randomly. We found that the V4 population discriminated between members of the two image sets with similar fidelity, whereas discrimination by the IT population was considerably degraded for the scrambled images. These results suggest that IT neurons are in fact more selective than V4 neurons in terms of the image features that drive these cells. As a second estimate of selectivity, we measured the tuning bandwidth of neurons for natural images ("sparseness"). Surprisingly, we found that distributions of sparseness values were indistinguishable between V4 and IT. How can the selectivity for natural image features increase while the tuning bandwidth for natural images remains constant? One possibility is that increases in selectivity for particular image features are offset by increases in tolerance for (e.g.) the position and scale of those features. We found that, indeed, measures of tolerance were higher in IT than in V4. These results confirm that neurons increase both their selectivity for image features and their tolerance to changes in the position and scale of those features as signals propagate through the ventral pathway. Remarkably, the rates of increase of these two parameters appear to be set such that the tuning bandwidth for natural images is maintained across each stage of cortical processing.

}, doi = {10.1167/9.8.738}, url = {http://jov.arvojournals.org/Article.aspx?doi=10.1167/9.8.738}, author = {Rust, N. C. and DiCarlo, J. J.} } @article {56, title = {A High-Throughput Screening Approach to Discovering Good Forms of Biologically Inspired Visual Representation}, journal = {PLoS Computational Biology}, volume = {5}, year = {2009}, month = {2009}, pages = {e1000579}, abstract = {

While many models of biological object recognition share a common set of "broad-stroke" properties, the performance of any one model depends strongly on the choice of parameters in a particular instantiation of that model: e.g., the number of units per layer, the size of pooling kernels, exponents in normalization operations, etc. Since the number of such parameters (explicit or implicit) is typically large and the computational cost of evaluating one particular parameter set is high, the space of possible model instantiations goes largely unexplored. Thus, when a model fails to approach the abilities of biological visual systems, we are left uncertain whether this failure is because we are missing a fundamental idea or because the correct "parts" have not been tuned correctly, assembled at sufficient scale, or provided with enough training. Here, we present a high-throughput approach to the exploration of such parameter sets, leveraging recent advances in stream processing hardware (high-end NVIDIA graphics cards and the PlayStation 3's IBM Cell Processor). In analogy to high-throughput screening approaches in molecular biology and genetics, we explored thousands of potential network architectures and parameter instantiations, screening those that show promising object recognition performance for further analysis. We show that this approach can yield significant, reproducible gains in performance across an array of basic object recognition tasks, consistently outperforming a variety of state-of-the-art purpose-built vision systems from the literature. As the scale of available computational power continues to expand, we argue that this approach has the potential to greatly accelerate progress in both artificial vision and our understanding of the computational underpinning of biological vision.
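A minimal sketch of the screening loop: sample many random instantiations from a parameter space, score each on a screening task, and keep the top performers. The search space and the scoring function here are placeholder assumptions; the paper evaluated full multi-layer visual models on GPU hardware.

```python
# Sketch: high-throughput random screening over model instantiations.
import random

SEARCH_SPACE = {
    "n_filters": [16, 32, 64, 128, 256],
    "pool_size": [3, 5, 7, 9],
    "norm_exponent": [0.5, 1.0, 2.0],
    "learning_rate": [1e-4, 1e-3, 1e-2],
}

def sample_model():
    return {k: random.choice(v) for k, v in SEARCH_SPACE.items()}

def screen(params):
    # Placeholder for building, training, and evaluating one model
    # instantiation on a screening recognition task; dummy score here.
    return random.random()

random.seed(8)
candidates = [(screen(p), p) for p in (sample_model() for _ in range(500))]
top = sorted(candidates, key=lambda sp: sp[0], reverse=True)[:5]
for score, params in top:
    print(f"{score:.3f}  {params}")
```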

}, doi = {10.1371/journal.pcbi.1000579}, url = {https://dx.plos.org/10.1371/journal.pcbi.1000579}, author = {Pinto, Nicolas and Doukhan, David and DiCarlo, James J. and Cox, David D.}, editor = {Friston, Karl J.} } @conference {57, title = {How far can you get with a modern face recognition test set using only simple features?}, booktitle = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops)}, year = {2009}, month = {06/2009}, publisher = {IEEE}, organization = {IEEE}, address = {Miami, FL}, abstract = {

In recent years, large databases of natural images have become increasingly popular in the evaluation of face and object recognition algorithms. However, Pinto et al. previously illustrated an inherent danger in using such sets, showing that an extremely basic recognition system, built on a trivial feature set, was able to take advantage of low-level regularities in popular object and face recognition sets, performing on par with many state-of-the-art systems. Recently, several groups have raised the performance "bar" for these sets, using more advanced classification tools. However, it is difficult to know whether these improvements are due to progress towards solving the core computational problem, or are due to further improvements in the exploitation of low-level regularities. Here, we show that even modest optimization of the simple model introduced by Pinto et al. using modern multiple kernel learning (MKL) techniques once again yields "state-of-the-art" performance levels on a standard face recognition set ("labeled faces in the wild"). However, at the same time, even with the inclusion of MKL techniques, systems based on these simple features still fail on a synthetic face recognition test that includes more "realistic" view variation by design. These results underscore the importance of building test sets focused on capturing the central computational challenges of real-world face recognition.
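A toy sketch of classifying with a combination of precomputed kernels. True MKL learns the combination weights jointly with the classifier; here the weights are fixed and uniform, which only gestures at the idea, and the features and labels are random stand-ins rather than face data.

```python
# Sketch: SVM on a fixed-weight combination of two kernels.
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(9)
X = rng.normal(size=(200, 300))               # hypothetical simple features
y = rng.integers(0, 2, 200)                   # hypothetical pair labels

k_linear = X @ X.T                                        # kernel 1: linear
k_rbf = np.exp(-cdist(X, X, "sqeuclidean") / X.shape[1])  # kernel 2: RBF
k_combined = 0.5 * k_linear + 0.5 * k_rbf     # fixed weights (MKL learns these)

acc = cross_val_score(SVC(kernel="precomputed"), k_combined, y, cv=5).mean()
print(f"combined-kernel accuracy: {acc:.2f}")
```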

}, isbn = {978-1-4244-3992-8}, doi = {10.1109/CVPR.2009.5206605}, url = {https://ieeexplore.ieee.org/document/5206605}, author = {Pinto, Nicolas and DiCarlo, James J. and Cox, David D.} } @article {58, title = {A rodent model for the study of invariant visual object recognition}, journal = {Proceedings of the National Academy of Sciences}, volume = {106}, year = {2009}, month = {05/2009}, pages = {8748 - 8753}, abstract = {

The human visual system is able to recognize objects despite tremendous variation in their appearance on the retina resulting from variation in view, size, lighting, etc. This ability, known as "invariant" object recognition, is central to visual perception, yet its computational underpinnings are poorly understood. Traditionally, nonhuman primates have been the animal model of choice for investigating the neuronal substrates of invariant recognition, because their visual systems closely mirror our own. Meanwhile, simpler and more accessible animal models such as rodents have been largely overlooked as possible models of higher-level visual functions, because their brains are often assumed to lack advanced visual processing machinery. As a result, little is known about rodents' ability to process complex visual stimuli in the face of real-world image variation. In the present work, we show that rats possess more advanced visual abilities than previously appreciated. Specifically, we trained pigmented rats to perform a visual task that required them to recognize objects despite substantial variation in their appearance, due to changes in size, view, and lighting. Critically, rats were able to spontaneously generalize to previously unseen transformations of learned objects. These results provide the first systematic evidence for invariant object recognition in rats and argue for an increased focus on rodents as models for studying high-level visual processing.

}, issn = {0027-8424}, doi = {10.1073/pnas.0811583106}, url = {http://www.pnas.org/cgi/doi/10.1073/pnas.0811583106}, author = {Zoccolan, D. and Oertelt, N. and DiCarlo, J. J. and Cox, D. D.} } @proceedings {59, title = {The size invariance of neuronal object representations can be reshaped by temporally contiguous visual experience}, journal = {Society for Neuroscience}, year = {2009}, month = {10/2009}, pages = {306.10}, publisher = {SFN}, address = {Chicago, IL, USA}, abstract = {

Each object can cast an infinite number of different images on the retina, and understanding how the brain tolerates this image variation is the key to solving object recognition. The responses of neurons at the top of the ventral visual stream (inferior temporal cortex, IT) exhibit tolerant object selectivity. How do IT neurons attain this tolerance (\"invariance\")?
One powerful idea is that temporal contiguity of natural visual experience can instruct tolerance (Foldiak, 1991): because objects remain present for many seconds, whereas object or viewer motion causes changes in each object's retinal image over shorter time intervals, the ventral stream could construct tolerance by learning to associate neuronal representations that occur closely in time. We recently found a neuronal signature of such learning in IT: temporally contiguous experience with different object images at different retinal positions can robustly reshape (\"break\") IT position tolerance, producing a tendency to confuse the identities of temporally coupled objects across their manipulated positions (Li \& DiCarlo 2008). A similar manipulation can induce the same pattern of confusion in the position tolerance of human object perception (Cox et al. 2005).
Does this IT neuronal learning reflect a canonical unsupervised learning algorithm that the ventral stream relies on to achieve tolerance to all types of image variation (e.g. object size and pose changes)? To begin to answer this question, we here extend our previous position tolerance paradigm to object size changes. Non-human primates were exposed to an unsupervised, altered visual world in which we temporally coupled the experience of two object images of different sizes at each animal's center of gaze: for example, a small image of one object (P, the neuronally preferred object) was consistently followed by a large image of a second object (N), rendering the small image of P temporally contiguous with the large image of N.
We found that this unsupervised experience manipulation robustly reshapes IT size tolerance over a period of hours. Specifically, unlike experienced controls, we found a change in neuronal selectivity (P-N) across the manipulated objects and their manipulated sizes, producing a tendency to confuse those object identities across those sizes. This change in size tolerance grew gradually stronger with increasing experience, and the rate of learning was similar to our previous work on position tolerance. We speculate that these converging results reflect an underlying canonical learning mechanism by which the ventral visual system acquires and maintains its tolerant object representations.

}, url = {https://www.abstractsonline.com/Plan/ViewAbstract.aspx?sKey=8bb461de-0fd1-4f6a-9dfe-d62b65382083\&cKey=507938c9-dc2c-4a47-a74f-601df562eddc\&mKey=\%7b081F7976-E4CD-4F3D-A0AF-E8387992A658\%7d}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {66, title = {A systematic exploration of the relationship of fMRI signals and neuronal activity in the primate temporal lobe}, journal = {Society for Neuroscience}, year = {2009}, month = {11/2009}, publisher = {SFN}, address = {Washington, DC, USA}, author = {Papanastassiou, A. M. and Op de Beeck, H. P. and Andken, BB and DiCarlo, James J.} } @proceedings {60, title = {Unlocking Biologically-Inspired Computer Vision: a High-Throughput Approach}, journal = {NVIDIA GPU Technology Conference}, year = {2009}, month = {2009}, author = {Pinto, Nicolas and Cox, David D. and DiCarlo, James J.} } @proceedings {61, title = {Unlocking Brain-Inspired Computer Vision}, journal = {GPU@BU}, year = {2009}, month = {2009}, author = {Pinto, Nicolas and Cox, David D. and DiCarlo, James J.} } @proceedings {62, title = {The Visual Cortex and GPUs}, journal = {GPU Computing for Biomedical Research}, year = {2009}, month = {2009}, address = {Massachusetts General Hospital, Boston, MA}, author = {Pinto, Nicolas and Cox, David D. and DiCarlo, James J.} } @article {53, title = {What Response Properties Do Individual Neurons Need to Underlie Position and Clutter {\textquotedblleft}Invariant{\textquotedblright} Object Recognition?}, journal = {Journal of Neurophysiology}, volume = {102}, year = {2009}, month = {01/2009}, pages = {360 - 376}, abstract = {

Primates can easily identify visual objects over large changes in retinal position, a property commonly referred to as position \"invariance.\" This ability is widely assumed to depend on neurons in inferior temporal cortex {(IT)} that can respond selectively to isolated visual objects over similarly large ranges of retinal position. However, in the real world, objects rarely appear in isolation, and the interplay between position invariance and the representation of multiple objects (i.e., clutter) remains unresolved. At the heart of this issue is the intuition that the representations of nearby objects can interfere with one another and that the large receptive fields needed for position invariance can exacerbate this problem by increasing the range over which interference acts. Indeed, most {IT} neurons' responses are strongly affected by the presence of clutter. While external mechanisms (such as attention) are often invoked as a way out of the problem, we show (using recorded neuronal data and simulations) that the intrinsic properties of {IT} population responses, by themselves, can support object recognition in the face of limited clutter. Furthermore, we carried out extensive simulations of hypothetical neuronal populations to identify the essential individual-neuron ingredients of a good population representation. These simulations show that the crucial neuronal property to support recognition in clutter is not preservation of response magnitude, but preservation of each neuron's rank-order object preference under identity-preserving image transformations (e.g., clutter). Because {IT} neuronal responses often exhibit that response property, while neurons in earlier visual areas (e.g., V1) do not, we suggest that preserving the rank-order object preference regardless of clutter, rather than the response magnitude, more precisely describes the goal of individual neurons at the top of the ventral visual stream.
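One concrete way to quantify "preservation of rank-order object preference," the property highlighted above, is a rank correlation between a neuron's object preferences with and without clutter. A sketch with simulated responses (the numbers and the choice of Spearman correlation are assumptions, not necessarily the paper's exact analysis):

    import numpy as np
    from scipy.stats import spearmanr

    rng = np.random.default_rng(1)
    # Hypothetical mean responses of one neuron to 10 isolated objects...
    r_isolated = rng.gamma(shape=2.0, scale=5.0, size=10)
    # ...and to the same objects in clutter: magnitudes are suppressed,
    # but the preference ranking is largely preserved (plus noise).
    r_clutter = 0.5 * r_isolated + rng.normal(scale=0.5, size=10)

    rho, _ = spearmanr(r_isolated, r_clutter)
    print(f"rank-order preservation (Spearman rho): {rho:.2f}")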

}, issn = {0022-3077}, doi = {10.1152/jn.90745.2008}, url = {https://www.physiology.org/doi/10.1152/jn.90745.2008}, author = {Li, Nuo and Cox, David D. and Zoccolan, Davide and DiCarlo, James J.} } @proceedings {69, title = {Concurrent increases in selectivity and tolerance produce constant sparseness across the ventral visual stream}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2008}, month = {03/2008}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, abstract = {

Neural coding schemes that minimize the number of neurons activated at any one time (or equivalently maximize \"sparseness\") are thought to be both metabolically and computationally efficient [reviewed by 1]. But does sparseness increase as signals propagate through the cortex? To investigate this question, we compared the response properties of neurons at different stages along the pathway supporting object recognition, the ventral visual stream. Specifically, we recorded the responses of neurons in a mid-level visual area (V4) and a high-level visual area (anterior inferotemporal cortex, IT) to a large set of natural images while monkeys performed an object detection task. We found that the distributions of sparseness values in V4 and IT were indistinguishable and that most neurons in both areas were broadly tuned. Similarly, individual images activated the same, large fraction of neurons in V4 and IT. Thus it appears that a coding principle is conserved at each level of processing; however, in opposition to theories of sparse coding, the tuning we observed in mid- and high-level vision is more consistent with a broadly distributed coding scheme [see also 2].

If sparseness is not changing, what is happening as signals propagate through the visual system? We began investigating this question by measuring tolerance to position and scale transformations. We found that individual neuron tolerance increased from V4 to IT, and this translated to enhanced performance of the IT over the V4 population on a position- and scale-invariant object recognition task. To determine whether the neurons in each area also differ in terms of the image features that elicit a response, we presented natural and \"scrambled\" images that have the same local structure but configured randomly [3]. We found that V4 neurons responded similarly to both image sets whereas IT neurons responded much more robustly to the natural images. Likewise, the V4 population discriminated between members of the two image sets with similar fidelity whereas discrimination by the IT population was considerably degraded for the scrambled as compared to the natural images. These results suggest that IT neurons are more selective than V4 neurons in terms of the image features that drive these cells. Moreover, we found that equivalent sparseness values were correlated with higher levels of selectivity and tolerance in IT as compared to V4. Thus, as signals propagate through the visual system, neurons increase their selectivity for particular image features and, at the same time, neurons increase their tolerance for the position and scale of those features; the rates at which these two factors increase are set such that constant sparseness is maintained at each level of visual processing. Consistent with the observation that the structure of cortex is roughly identical regardless of where it sits in the hierarchy, we speculate that conservation of a broadly distributed coding scheme is an optimal use of resources in equipotential cortex.

References

[1] Sparse coding of sensory inputs. BA Olshausen and DJ Field, Curr Opin Neurobiol., 14:481-7, 2004.
[2] Responses of neurons in primary and inferior temporal visual cortices to natural scenes. R Baddeley et al., Proc Biol Sci., 264:1775-83, 1997.
[3] A parametric texture model based on joint statistics of complex wavelet coefficients. J Portilla and EP Simoncelli, Int J Comp Vis, 40:49-71, 2000.
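The abstract does not specify the sparseness index used; a common choice in this literature (Treves-Rolls, as adapted by Vinje and Gallant) is sketched below in Python, where 0 indicates fully distributed responses and 1 a response concentrated on a single image:

    import numpy as np

    def sparseness(r):
        # Treves-Rolls style sparseness for a vector of nonnegative
        # responses r (not all zero): (1 - a) / (1 - 1/n), with
        # a = (mean r)^2 / mean(r^2).
        r = np.asarray(r, dtype=float)
        a = r.mean() ** 2 / np.mean(r ** 2)
        return (1.0 - a) / (1.0 - 1.0 / r.size)

    print(sparseness([5, 5, 5, 5]))   # 0.0: broadly tuned
    print(sparseness([20, 0, 0, 0]))  # 1.0: maximally sparse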

}, url = {http://www.cosyne.org/c/images/8/8e/Cosyne_pf_new.pdf}, author = {Rust, N. C. and DiCarlo, James J.} } @article {68, title = {Does Learned Shape Selectivity in Inferior Temporal Cortex Automatically Generalize Across Retinal Position?}, journal = {Journal of Neuroscience}, volume = {28}, year = {2008}, month = {01/2008}, pages = {10045 - 10055}, abstract = {

Biological visual systems have the remarkable ability to recognize objects despite confounding factors such as object position, size, pose, and lighting. In primates, this ability likely results from neuronal responses at the highest stage of the ventral visual stream [inferior temporal cortex {(IT)}] that signal object identity while tolerating these factors. However, for even the apparently simplest {IT} tolerance (\"invariance\"), tolerance to object position on the retina, little is known about how this feat is achieved. One possibility is that {IT} position tolerance is innate in that discriminatory power for newly learned objects automatically generalizes across position. Alternatively, visual experience plays a role in developing position tolerance. To test these ideas, we trained adult monkeys in a difficult object discrimination task in which their visual experience with novel objects was restricted to a single retinal position. After training, we recorded the spiking activity of an unbiased population of {IT} neurons and found that it contained significantly greater selectivity among the newly learned objects at the experienced position compared with a carefully matched, non-experienced position. Interleaved testing with other objects shows that this difference cannot be attributed to a bias in spatial attention or neuronal sampling. We conclude from these results that, at least under some conditions, full transfer of {IT} neuronal selectivity across retinal position is not automatic. This finding raises the possibility that visual experience plays a role in building neuronal tolerance in the ventral visual stream and the recognition abilities it supports.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.2142-08.2008}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.2142-08.2008}, author = {Cox, D. D. and DiCarlo, J. J.} } @conference {70, title = {Establishing Good Benchmarks and Baselines for Face Recognition}, booktitle = {European Conference on Computer Vision-Faces in {\textquoteright}Real-Life{\textquoteright} Images Workshop}, year = {2008}, month = {10/2008}, publisher = {ECCV}, organization = {ECCV}, address = {Marseille, France}, abstract = {

Progress in face recognition relies critically on the creation of test sets against which the performance of various approaches can be evaluated. A good set must capture the essential elements of what makes the problem hard, while conforming to practical scale limitations. However, these goals are often deceptively difficult to achieve. In the related area of object recognition, Pinto et al. [2] demonstrated the potential dangers of using a large, uncontrolled natural image set, showing that an extremely rudimentary vision system (inspired by the early stages of visual processing in the brain) was able to perform on par with many state-of-the-art vision systems on the popular Caltech101 object set [3]. At the same time, this same rudimentary system was easily defeated by an ostensibly \"simpler\" synthetic recognition test designed to better span the range of real world variation in object pose, position, scale, etc. These results suggested that image sets that look \"natural\" to human observers may nonetheless fail to properly embody the problem of interest, and that care must be taken to establish baselines against which performance can be judged. Here, we repeat this approach for the \"Labeled Faces in the Wild\" (LFW) dataset [1], and for a collection of standard face recognition tests. The goal of the present work is not to compete in the LFW challenge, per se, but to provide a baseline against which the performance of other systems can be judged. In particular, we found that our rudimentary \"baseline\" vision system was able to achieve 68\% correct performance on the LFW challenge, substantially higher than a \"pure chance\" baseline. We argue that this value might serve as a more useful baseline against which to evaluate absolute performance and argue that the LFW set, while perhaps not perfect, represents an improvement over other standard face sets.

}, author = {Pinto, Nicolas and DiCarlo, James J. and Cox, David D.} } @article {54, title = {Fine-Scale Spatial Organization of Face and Object Selectivity in the Temporal Lobe: Do Functional Magnetic Resonance Imaging, Optical Imaging, and Electrophysiology Agree?}, journal = {Journal of Neuroscience}, volume = {28}, year = {2008}, month = {12/2008}, pages = {11796 - 11801}, abstract = {

The spatial organization of the brain's object and face representations in the temporal lobe is critical for understanding high-level vision and cognition but is poorly understood. Recently, exciting progress has been made using advanced imaging and physiology methods in humans and nonhuman primates, and the combination of such methods may be particularly powerful. Studies applying these methods help us to understand how neuronal activity, optical imaging, and functional magnetic resonance imaging signals are related within the temporal lobe, and to uncover the fine-grained and large-scale spatial organization of object and face representations in the primate brain.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.3799-08.2008}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.3799-08.2008}, author = {Op de Beeck, H. P. and DiCarlo, J. J. and Goense, J. B. M. and Grill-Spector, K. and Papanastassiou, A. and Tanifuji, M. and Tsao, D. Y.} } @article {64, title = {High-Resolution Three-Dimensional Microelectrode Brain Mapping Using Stereo Microfocal X-ray Imaging}, journal = {Journal of Neurophysiology}, volume = {100}, year = {2008}, month = {01/2008}, pages = {2966 - 2976}, abstract = {

Much of our knowledge of brain function has been gleaned from studies using microelectrodes to characterize the response properties of individual neurons in vivo. However, because it is difficult to accurately determine the location of a microelectrode tip within the brain, it is impossible to systematically map the fine three-dimensional spatial organization of many brain areas, especially in deep structures. Here, we present a practical method based on digital stereo microfocal X-ray imaging that makes it possible to estimate the three-dimensional position of each and every microelectrode recording site in \"real time\" during experimental sessions. We determined the system's ex vivo localization accuracy to be better than 50 µm, and we show how we have used this method to coregister hundreds of deep-brain microelectrode recordings in monkeys to a common frame of reference with median error of {\textless}150 µm. We further show how we can coregister those sites with magnetic resonance images {(MRIs)}, allowing for comparison with anatomy, and laying the groundwork for more detailed electrophysiology/functional {MRI} comparison. Minimally, this method allows one to marry the single-cell specificity of microelectrode recording with the spatial mapping abilities of imaging techniques; furthermore, it has the potential of yielding fundamentally new kinds of high-resolution maps of brain function.
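The geometric core of such stereo localization is triangulating the electrode tip from two calibrated rays; the sketch below shows only that least-squares step (ray origins and directions are hypothetical, and the hard part, calibrating the rays from the X-ray geometry, is assumed done elsewhere):

    import numpy as np

    def triangulate(p1, d1, p2, d2):
        # Least-squares 3D point closest to two rays, each given by an
        # origin p and a direction d.
        A, b = [], []
        for p, d in ((p1, d1), (p2, d2)):
            d = d / np.linalg.norm(d)
            P = np.eye(3) - np.outer(d, d)  # projector orthogonal to the ray
            A.append(P)
            b.append(P @ p)
        x, *_ = np.linalg.lstsq(np.vstack(A), np.concatenate(b), rcond=None)
        return x

    # Two hypothetical rays that intersect near (1, 2, 3):
    tip = triangulate(np.array([0.0, 0.0, 0.0]), np.array([1.0, 2.0, 3.0]),
                      np.array([10.0, 0.0, 0.0]), np.array([-9.0, 2.0, 3.0]))
    print(tip)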

}, issn = {0022-3077}, doi = {10.1152/jn.90672.2008}, url = {https://www.physiology.org/doi/10.1152/jn.90672.2008}, author = {Cox, David D. and Papanastassiou, Alexander M. and Oreper, Daniel and Andken, Benjamin B. and DiCarlo, James J.} } @proceedings {71, title = {A high-throughput screening approach to discovering good forms of visual representation}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2008}, month = {03/2008}, address = {Salt Lake City, Utah, USA}, author = {Cox, David D. and Pinto, Nicolas and Doukhan, David and Corda, B and DiCarlo, James J.} } @proceedings {67, title = {Increases in selectivity are offset by increases in tolerance ("invariance") to maintain sparseness across the ventral visual pathway}, journal = {Society for Neuroscience}, year = {2008}, month = {11/2008}, pages = {514.8}, publisher = {SFN}, address = {Washington, DC, USA}, abstract = {

Popular accounts of visual processing suggest that neurons become increasingly selective for particular objects and scenes as signals propagate through the ventral visual pathway, but this hypothesis has proven difficult to test. To investigate this issue systematically, we recorded the responses of neurons in a mid-level visual area (V4) and a high-level visual area (IT) under well-controlled conditions (same monkeys, same task, same region of the visual field, same stimuli). We assessed the selectivity of neurons in each area by determining how well each population could discriminate between natural images and \"scrambled\" versions of those images that have the same local structure but configured randomly. We found that the V4 population discriminated between members of the two image sets with similar fidelity whereas discrimination by the IT population was considerably degraded for the scrambled as compared to the natural images. These results suggest that IT neurons are more selective than V4 neurons in terms of the image features that drive these cells. As a second estimate of selectivity, we measured the tuning bandwidth of neurons for natural images (commonly called \"sparseness\"). Surprisingly, we found that distributions of sparseness values were indistinguishable between V4 and IT. Similarly, we found that individual images activated the same fraction of neurons in V4 and IT, suggesting that a coding principle is conserved at each level of processing. How can the selectivity for natural image features increase while the tuning bandwidth for natural images remains constant? One possible explanation is that increases in selectivity for particular image features are offset by increases in tolerance for the (e.g.) position and scale of those features. Indeed, when we measured the tolerance of neurons to changes in the position and scale of images, we found that tolerance increases as signals propagate from V4 to IT. Moreover, we found that equivalent sparseness values were correlated with higher levels of selectivity and tolerance in IT as compared to V4. These results confirm that neurons increase both their selectivity for image features and their tolerance to changes in the position and scale of those features as signals propagate through the ventral visual pathway. Remarkably, the rates of increase of these two parameters appear to be set such that an equally distributed coding scheme is maintained at each level of visual processing.

}, url = {https://www.abstractsonline.com/Plan/ViewAbstract.aspx?sKey=fc0d0a2d-b563-4b41-8311-0805f08bde8a\&cKey=8a2c998e-bc76-4d92-96ac-ad5199da59bf\&mKey=\%7bAFEA068D-D012-4520-8E42-10E4D1AF7944\%7d}, author = {Rust, N. C. and James J. DiCarlo} } @proceedings {72, title = {Inferior temporal cortex robustly signals encounters with new objects, but is not an online representation of the visual world}, journal = {Society for Neuroscience}, year = {2008}, month = {11/2008}, pages = {316.6}, publisher = {SFN}, address = {Washington, DC, USA}, abstract = {


}, url = {https://www.abstractsonline.com/Plan/ViewAbstract.aspx?sKey=ee83e7f7-5aea-4ec8-a948-436658d20e37\&cKey=fc64f0af-c81e-4b0e-b809-796349279531\&mKey=\%7bAFEA068D-D012-4520-8E42-10E4D1AF7944\%7d}, author = {Rust, N. C. and DiCarlo, James J.} } @proceedings {74, title = {Natural experience drives online learning of tolerant object representations in visual cortex}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2008}, month = {03/2008}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, abstract = {

Object recognition is computationally challenging because each object produces a myriad of retinal images. Yet the visual system somehow solves it effortlessly. Neuronal responses at the top of the primate ventral visual stream (inferior temporal cortex; IT) have a key response property that likely underlies this ability -- they are selective among visual objects, yet tolerant to changes in object position, size, pose, lighting, etc. How this tolerant selectivity is constructed remains a fundamental mystery. One possibility is that the visual system builds that tolerance via the spatiotemporal statistics of natural visual experience. Because objects are typically present for relatively long time intervals, while object motion or viewer motion (e.g. eye movements) cause rapid changes in each object's retinal image, the ventral visual stream could construct tolerance by associating neuronal representations that occur closely in time. If this hypothesis is correct, then we might create \"incorrect\" tolerance by targeted manipulation of these spatiotemporal statistics. Specifically, if we engineered an altered visual world in which some objects consistently changed identity across retinal position, then, following sufficient exposure to this world, the visual system might incorrectly associate the representations of those objects at those positions. The main prediction is that individual IT neurons would lose their normal position-tolerance (i.e. object preference maintained across retinal position), and would instead tend to prefer one object at one position, and another object at the other position (see figure). We monitored single IT neurons' position-tolerance in two monkeys while they visually explored our altered visual world. We used real-time eye tracking to present visual objects at controlled retinal positions during free viewing: as the animal saccaded toward a specific object (A), it was consistently replaced by another object (B). This manipulation caused the image of object A at a peripheral retinal position (\"swapped\") to be consistently temporally associated with the image of object B on the fovea. Remarkably, while each animal explored this altered world, its IT neurons gradually began to reverse their object preferences at the swapped position, exactly as predicted. This effect continued to get larger for as long as we could hold neurons (~1 hour), it was specific for object position (counterbalanced across neurons) and object identity, and it cannot be explained by adaptation. We have previously found that similar manipulations of experience produce changes in the position tolerance of human object perception [1]. Taken together, our results suggest that the ventral visual stream acquires and maintains a tolerant object representation via the spatiotemporal statistics of natural visual experience, without external supervision. The relatively fast time-scale of this unsupervised learning opens the door to rapid advances in characterizing the crucial spatiotemporal image statistics, understanding other types of tolerance (e.g. size, pose), and ultimately connecting a central cognitive ability -- tolerant object recognition -- to cellular and molecular plasticity mechanisms.

References

[1] 'Breaking' position-invariant object recognition. Cox DD, Meier P, Oertelt N, and DiCarlo JJ, Nature Neuroscience 8:1145-1147, 2005.
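A deliberately toy sketch of the temporal-contiguity (trace-rule) idea invoked above, in the spirit of Foldiak (1991) but not the authors' model; the one-hot stimulus coding, the learning constants, and the two-unit readout are all assumptions. Pairing object A at the periphery with object B at the fovea gradually erodes the A-vs-B selectivity at the swapped position:

    import numpy as np

    FOV_A, FOV_B, PER_A, PER_B = range(4)  # (object, position) conjunctions

    def one_hot(i, n=4):
        x = np.zeros(n)
        x[i] = 1.0
        return x

    # Two readout units, initially position tolerant: unit 0 prefers object A
    # at both positions, unit 1 prefers object B at both positions.
    W = np.array([[1.0, 0.0, 1.0, 0.0],
                  [0.0, 1.0, 0.0, 1.0]])

    eta, decay, x_trace = 0.05, 0.5, np.zeros(4)
    for exposure in range(201):
        # Altered world: a saccade toward A in the periphery lands on B at
        # the fovea, making PER_A temporally contiguous with FOV_B.
        for frame in (one_hot(PER_A), one_hot(FOV_B)):
            y = W @ frame
            W += eta * np.outer(y, x_trace)       # associate with recent input
            W /= np.linalg.norm(W, axis=1, keepdims=True)
            x_trace = decay * x_trace + frame
        if exposure % 50 == 0:
            sel = (W[0] - W[1]) @ one_hot(PER_A)  # A-vs-B at swapped position
            print(f"exposure {exposure:3d}: swapped-position selectivity {sel:+.3f}")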

}, url = {http://www.cosyne.org/c/images/8/8e/Cosyne_pf_new.pdf}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {73, title = {Is the rodent a valuable model system for studying invariant object recognition?}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2008}, month = {03/2008}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, abstract = {

Despite the many advantages rodents offer in terms of experimental accessibility, they have never been extensively used as models to investigate the neuronal processing of visual objects. A crucial step in establishing whether rodents are suitable models for the study of object vision is to assess if they are capable of invariant object recognition, i.e., recognition of visual objects across the range of transformations that they typically undergo during natural vision (e.g., changes in position and size). In this study, we tested the capability of rats (Long-Evans) to discriminate between two geometrical shapes (square and triangle) presented at different sizes, positions and orientations. In each trial, a single shape was presented on a computer monitor and the animal had to report its identity. The experiment consisted of three phases. In phase I, each animal learned to discriminate between the two shapes presented at fixed size (40° of visual angle), position (center of the monitor), and orientation. Naïve rats typically achieved >70\% correct performance in 2-3 weeks of training. In phase II, the animals were trained to perform the task while either the size, the horizontal position, or the orientation of the shapes was separately varied. Rats readily acquired this task (>70\% correct performance) across sizes ranging from 40° to 10°, positions spanning ±12°, and orientations spanning ±40°. During phase III of the experiment, we asked if rats could generalize to novel combinations of size, position, and orientation (a total of 100 transformations of each target shape were tested). Performance was typically >70\% correct for nearly all of these previously unseen transformations (performance of one rat over a subset of the tested conditions is shown in the figure) and even for a fraction of transformations for which feedback was withheld. These results show that rats can readily learn to: 1) discriminate between simple visual shapes; 2) disregard variations in their position, size and orientation; and 3) generalize to novel views. This suggests that the rat visual system contains the fundamental mechanisms that support object recognition. Therefore, given the broad ranges of experimental approaches available in rats, they may be a powerful new model system to study the neuronal basis of invariant object recognition.

}, url = {http://www.cosyne.org/c/images/8/8e/Cosyne_pf_new.pdf}, author = {Zoccolan, Davide and Cox, David D. and Oertelt, Nadja and Radwan, Basma and Tsang, Sabrina and DiCarlo, James J.} } @article {63, title = {Unsupervised Natural Experience Rapidly Alters Invariant Object Representation in Visual Cortex}, journal = {Science}, volume = {321}, year = {2008}, month = {12/2008}, pages = {1502 - 1507}, abstract = {

Object recognition is challenging because each object produces myriad retinal images. Responses of neurons from the inferior temporal cortex {(IT)} are selective to different objects, yet tolerant (\"invariant\") to changes in object position, scale, and pose. How does the brain construct this neuronal tolerance? We report a form of neuronal learning that suggests the underlying solution. Targeted alteration of the natural temporal contiguity of visual experience caused specific changes in {IT} position tolerance. This unsupervised temporal slowness learning {(UTL)} was substantial, increased with experience, and was significant in single {IT} neurons after just 1 hour. Together with previous theoretical work and human object perception experiments, we speculate that {UTL} may reflect the mechanism by which the visual stream builds and maintains tolerant object representations.

}, issn = {0036-8075}, doi = {10.1126/science.1160028}, url = {http://www.sciencemag.org/cgi/content/full/321/5895/1502?ijkey=wb6T4x69JeSes\&keytype=ref\&siteid=sci}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {76, title = {Unsupervised natural experience rapidly alters invariant object representation in visual cortex}, journal = {Society for Neuroscience}, year = {2008}, month = {11/2008}, pages = {316.5}, publisher = {SFN}, address = {Washington, DC, USA}, abstract = {

The responses of cortical neurons at the top of the ventral visual stream -- inferior temporal (IT) cortex -- are selective to visual objects, yet tolerant (\"invariant\") to changes in object position, size, pose, etc. Though IT responses likely underlie object recognition behavior, how that neuronal tolerance is constructed remains a fundamental mystery. One possibility is that natural visual experience is an implicit teacher: because objects are present for relatively long time intervals, while object motion or viewer motion (e.g. eye movements) cause rapid changes in each object's retinal image, the visual system could learn tolerance by associating neuronal representations that occur closely in time. If this hypothesis is correct, then we might create \"incorrect\" tolerance by engineering an altered visual world in which we temporally couple the retinal images of two different objects at different retinal positions. The main prediction is that the visual system would incorrectly associate the representations of those objects at those positions. Thus, IT neurons might lose their position tolerant selectivity, and instead begin to prefer one object at one position and another object at the other position. To test this idea, two monkeys visually explored our altered visual world and we used real-time eye tracking to present visual objects at controlled retinal positions during free viewing. As the animal saccaded toward a specific object (P), it was consistently replaced by another object (N), rendering the image of P at a peripheral retinal position (\"swapped\") temporally contiguous with the image of N on the fovea. We found that exposure to these altered statistics changed IT object selectivity specifically at the swapped position, as predicted. This unsupervised temporal tolerance learning (UTL) was substantial (~5 spk/s selectivity change in 1 hr), gradually increased with exposure, and was highly significant at the population level (p=0.007 \"position x exposure\" interaction, bootstrap). Coupled with the finding that this same experience manipulation changes the position tolerance of human object perception (Cox et al, 2005), we speculate that UTL may reflect the mechanism by which the visual system builds and maintains tolerant object representations. The relatively fast time-scale and unsupervised nature of UTL open the door to advances in systematically characterizing the spatiotemporal image statistics that drive it, understanding if it plays a role in other types of tolerance (e.g. pose, scale), and perhaps connecting a central cognitive ability -- tolerant object recognition -- to cellular and molecular plasticity mechanisms.

}, url = {https://www.abstractsonline.com/Plan/ViewAbstract.aspx?sKey=ee83e7f7-5aea-4ec8-a948-436658d20e37\&cKey=9a873eb3-f8d3-48b5-8df9-9f7b2ef0a3d9\&mKey=\%7bAFEA068D-D012-4520-8E42-10E4D1AF7944\%7d}, author = {Li, Nuo and DiCarlo, James J.} } @proceedings {77, title = {Why is real-world object recognition hard?: Establishing honest benchmarks and baselines for object recognition}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2008}, month = {03/2008}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, author = {Pinto, Nicolas and Cox, David D. and Corda, B and Doukhan, David and DiCarlo, James J.} } @article {65, title = {Why is Real-World Visual Object Recognition Hard?}, journal = {PLoS Computational Biology}, volume = {4}, year = {2008}, month = {01/2008}, pages = {e27}, abstract = {

Progress in understanding the brain mechanisms underlying vision requires the construction of computational models that not only emulate the brain's anatomy and physiology, but ultimately match its performance on visual tasks. In recent years, \"natural\" images have become popular in the study of vision and have been used to show apparently impressive progress in building such models. Here, we challenge the use of uncontrolled \"natural\" images in guiding that progress. In particular, we show that a simple V1-like model, a neuroscientist's \"null\" model that should perform poorly at real-world visual object recognition tasks, outperforms state-of-the-art object recognition systems (biologically inspired and otherwise) on a standard, ostensibly natural image recognition test. As a counterpoint, we designed a \"simpler\" recognition test to better span the real-world variation in object pose, position, and scale, and we show that this test correctly exposes the inadequacy of the V1-like model. Taken together, these results demonstrate that tests based on uncontrolled natural images can be seriously misleading, potentially guiding progress in the wrong direction. Instead, we reexamine what it means for images to be natural and argue for a renewed focus on the core problem of object recognition: real-world image variation.
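What a "simple V1-like model" means can be made concrete with a Gabor filter bank; the sketch below (Python with NumPy/SciPy, illustrative parameters) shows only that front end, whereas the published model also includes further stages such as local normalization and thresholding:

    import numpy as np
    from scipy.signal import fftconvolve

    def gabor(size=11, wavelength=4.0, theta=0.0, sigma=3.0):
        # One odd-phase Gabor filter; a V1-like front end tiles many
        # orientations and scales (these parameter values are made up).
        half = size // 2
        y, x = np.mgrid[-half:half + 1, -half:half + 1]
        xr = x * np.cos(theta) + y * np.sin(theta)
        g = np.exp(-(x**2 + y**2) / (2 * sigma**2)) * np.sin(2 * np.pi * xr / wavelength)
        return g - g.mean()

    image = np.random.default_rng(2).normal(size=(64, 64))  # stand-in image
    features = np.stack([np.abs(fftconvolve(image, gabor(theta=t), mode="same"))
                         for t in np.linspace(0, np.pi, 4, endpoint=False)])
    print(features.shape)  # (orientations, height, width) response maps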

}, doi = {10.1371/journal.pcbi.0040027}, url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.0040027}, author = {Pinto, Nicolas and Cox, David D and DiCarlo, James J}, editor = {Friston, Karl J} } @article {75, title = {A Stable Topography of Selectivity for Unfamiliar Shape Classes in Monkey Inferior Temporal Cortex}, journal = {Cerebral Cortex}, volume = {18}, year = {2007}, month = {2007}, pages = {1676 - 1694}, abstract = {

The inferior temporal {(IT)} cortex in monkeys plays a central role in visual object recognition and learning. Previous studies have observed patches in {IT} cortex with strong selectivity for highly familiar object classes (e.g., faces), but the principles behind this functional organization are largely unknown due to the many properties that distinguish different object classes. To unconfound shape from meaning and memory, we scanned monkeys with functional magnetic resonance imaging while they viewed classes of initially novel objects. Our data revealed a topography of selectivity for these novel object classes across {IT} cortex. We found that this selectivity topography was highly reproducible and remarkably stable across a 3-month interval during which monkeys were extensively trained to discriminate among exemplars within one of the object classes. Furthermore, this selectivity topography was largely unaffected by changes in behavioral task and object retinal position, both of which preserve shape. In contrast, it was strongly influenced by changes in object shape. The topography was partially related to, but not explained by, the previously described pattern of face selectivity. Together, these results suggest that {IT} cortex contains a large-scale map of shape that is largely independent of meaning, familiarity, and behavioral task.

}, issn = {1047-3211}, doi = {10.1093/cercor/bhm196}, url = {https://academic.oup.com/cercor/article-lookup/doi/10.1093/cercor/bhm196}, author = {Op de Beeck, Hans P. and Deutsch, Jennifer A. and Vanduffel, Wim and Kanwisher, Nancy G. and DiCarlo, James J.} } @article {78, title = {Trade-Off between Object Selectivity and Tolerance in Monkey Inferotemporal Cortex}, journal = {Journal of Neuroscience}, volume = {27}, year = {2007}, month = {07/2007}, pages = {12292 - 12307}, abstract = {

Object recognition requires both selectivity among different objects and tolerance to vastly different retinal images of the same object, resulting from natural variation in (e.g.) position, size, illumination, and clutter. Thus, discovering neuronal responses that have object selectivity and tolerance to identity-preserving transformations is fundamental to understanding object recognition. Although selectivity and tolerance are found at the highest level of the primate ventral visual stream [the inferotemporal cortex {(IT)}], both properties are highly varied and poorly understood. If an {IT} neuron has very sharp selectivity for a unique combination of object features (\"diagnostic features\"), this might automatically endow it with high tolerance. However, this relationship cannot be taken as given; although some {IT} neurons are highly object selective and some are highly tolerant, the empirical connection of these key properties is unknown. In this study, we systematically measured both object selectivity and tolerance to different identity-preserving image transformations in the spiking responses of a population of monkey {IT} neurons. We found that {IT} neurons with high object selectivity typically have low tolerance (and vice versa), regardless of how object selectivity was quantified and the type of tolerance examined. The discovery of this trade-off illuminates object selectivity and tolerance in {IT} and unifies a range of previous, seemingly disparate results. This finding also argues against the idea that diagnostic conjunctions of features guarantee tolerance. Instead, it is naturally explained by object recognition models in which object selectivity is built through {AND-like} tuning mechanisms.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.1897-07.2007}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.1897-07.2007}, author = {Zoccolan, D. and Kouh, M. and Poggio, T. and DiCarlo, J. J.} } @article {79, title = {Untangling invariant object recognition}, journal = {Trends in Cognitive Sciences}, volume = {11}, year = {2007}, month = {01/2007}, pages = {333 - 341}, abstract = {

Despite tremendous variation in the appearance of visual objects, primates can recognize a multitude of objects, each in a fraction of a second, with no apparent effort. However, the brain mechanisms that enable this fundamental ability are not understood. Drawing on ideas from neurophysiology and computation, we present a graphical perspective on the key computational challenges of object recognition, and argue that the format of neuronal population representation and a property that we term 'object tangling' are central. We use this perspective to show that the primate ventral visual processing stream achieves a particularly effective solution in which single-neuron invariance is not the goal. Finally, we speculate on the key neuronal mechanisms that could enable this solution, which, if understood, would have far-reaching implications for cognitive neuroscience.

}, issn = {13646613}, doi = {10.1016/j.tics.2007.06.010}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661307001593}, author = {DiCarlo, James J. and Cox, David D.} } @article {82, title = {Discrimination Training Alters Object Representations in Human Extrastriate Cortex}, journal = {Journal of Neuroscience}, volume = {26}, year = {2006}, month = {2006}, pages = {13025 - 13036}, abstract = {

Visual object recognition relies critically on learning. However, little is known about the effect of object learning in human visual cortex, and in particular how the spatial distribution of training effects relates to the distribution of object and face selectivity across the cortex before training. We scanned human subjects with high-resolution functional magnetic resonance imaging {(fMRI)} while they viewed novel object classes, both before and after extensive training to discriminate between exemplars within one of these object classes. Training increased the strength of the response in visual cortex to trained objects compared with untrained objects. However, training did not simply induce a uniform increase in the response to trained objects: the magnitude of this training effect varied substantially across subregions of extrastriate cortex, with some showing a twofold increase in response to trained objects and others (including the right fusiform face area) showing no significant effect of training. Furthermore, the spatial distribution of training effects could not be predicted from the spatial distribution of either pretrained responses or face selectivity. Instead, training changed the spatial distribution of activity across the cortex. These findings support a dynamic view of the ventral visual pathway in which the cortical representation of an object category is continuously modulated by experience.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.2481-06.2006}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.2481-06.2006}, author = {Op de Beeck, H. P. and Baker, C. I. and DiCarlo, J. J. and Kanwisher, N. G.} } @proceedings {85, title = {Flexible and robust object recognition in inferior temporal cortex supported by neurons with limited position and clutter tolerance}, journal = {Society for Neuroscience}, year = {2006}, month = {10/2006}, publisher = {SFN}, address = {Atlanta, GA, USA}, author = {Li, N. and Cox, David D. and Zoccolan, Davide and DiCarlo, James J.} } @proceedings {84, title = {A large-scale shape map in monkey inferior temporal cortex}, journal = {Society for Neuroscience}, year = {2006}, month = {10/2006}, publisher = {SFN}, address = {Atlanta, GA, USA}, author = {Op de Beeck, H. P. and Deutsch, Jennifer A. and Vanduffel, Wim and Nancy Kanwisher and DiCarlo, James J.} } @article {83, title = {Learning and neural plasticity in visual object recognition}, journal = {Current Opinion in Neurobiology}, volume = {16}, year = {2006}, month = {01/2006}, pages = {152 - 158}, abstract = {

The capability of the adult primate visual system for rapid and accurate recognition of targets in cluttered, natural scenes far surpasses the abilities of state-of-the-art artificial vision systems. Understanding this capability remains a fundamental challenge in visual neuroscience. Recent experimental evidence suggests that adaptive coding strategies facilitated by underlying neural plasticity enable the adult brain to learn from visual experience and shape its ability to integrate and recognize coherent visual objects.

}, issn = {09594388}, doi = {10.1016/j.conb.2006.03.012}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438806000377}, author = {Kourtzi, Zoe and DiCarlo, James J} } @article {81, title = {Making faces in the brain}, journal = {Nature}, volume = {442}, year = {2006}, month = {01/2006}, pages = {644 - 644}, type = {News \& Views}, abstract = {

Artificially activating the right neurons at the right time causes visual perception of a face. This new result shows that such neurons directly underlie the recognition of complex objects.

}, issn = {0028-0836}, doi = {10.1038/nature05000}, url = {http://www.nature.com/articles/nature05000}, author = {DiCarlo, James J.} } @article {80, title = {Object Selectivity of Local Field Potentials and Spikes in the Macaque Inferior Temporal Cortex}, journal = {Neuron}, volume = {49}, year = {2006}, month = {01/2006}, pages = {433 - 445}, abstract = {

Local field potentials (LFPs) arise largely from dendritic activity over large brain regions and thus provide a measure of the input to and local processing within an area. We characterized LFPs and their relationship to spikes (multi- and single-unit) in monkey inferior temporal cortex (IT). LFP responses in IT to complex objects showed strong selectivity at 44\% of the sites and tolerance to retinal position and size. The LFP preferences were poorly predicted by the spike preferences at the same site but were better explained by averaging spikes within approximately 3 mm. A comparison of separate sites suggests that selectivity is similar on a scale of approximately 800 µm for spikes and approximately 5 mm for LFPs. These observations imply that inputs to IT neurons convey selectivity for complex shapes and that such input may have an underlying organization spanning several millimeters.

}, issn = {08966273}, doi = {10.1016/j.neuron.2005.12.019}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627305011311}, author = {Kreiman, Gabriel and Hung, Chou P. and Kraskov, Alexander and Quiroga, Rodrigo Quian and Poggio, Tomaso and DiCarlo, James J.} } @proceedings {86, title = {Trade-off between shape selectivity and tolerance to identity-preserving transformations in monkey inferotemporal cortex}, journal = {Gordon Conference: Sensation and the Natural Environment}, year = {2006}, month = {2006}, address = {Bozeman, MT, USA}, author = {Zoccolan, Davide and Kouh, M and Poggio, Tomaso and DiCarlo, James J.} } @proceedings {91, title = {Is the {\textquotedblleft}binding problem{\textquotedblright} a problem in inferotemporal cortex?}, journal = {Society for Neuroscience}, year = {2005}, month = {11/2005}, publisher = {SFN}, address = {Washington, DC, USA}, author = {Cox, David D. and DiCarlo, James J.} } @article {90, title = {{\textquoteright}Breaking{\textquoteright} position-invariant object recognition}, journal = {Nature Neuroscience}, volume = {8}, year = {2005}, month = {08/2005}, pages = {1145 - 1147}, abstract = {

While it is often assumed that objects can be recognized irrespective of where they fall on the retina, little is known about the mechanisms underlying this ability. By exposing human subjects to an altered world where some objects systematically changed identity during the transient blindness that accompanies eye movements, we induced predictable object confusions across retinal positions, effectively 'breaking' position invariance. Thus, position invariance is not a rigid property of vision but is constantly adapting to the statistics of the environment.

}, issn = {1097-6256}, doi = {10.1038/nn1519}, url = {http://www.nature.com/articles/nn1519.pdf}, author = {Cox, David D and Meier, Philip and Oertelt, Nadja and DiCarlo, James J} } @article {88, title = {Fast Readout of Object Identity from Macaque Inferior Temporal Cortex}, journal = {Science}, volume = {310}, year = {2005}, month = {04/2005}, pages = {863 - 866}, abstract = {

Understanding the brain computations leading to object recognition requires quantitative characterization of the information represented in inferior temporal (IT) cortex. We used a biologically plausible, classifier-based readout technique to investigate the neural coding of selectivity and invariance at the IT population level. The activity of small neuronal populations (approximately 100 randomly selected cells) over very short time intervals (as small as 12.5 milliseconds) contained unexpectedly accurate and robust information about both object \"identity\" and \"category.\" This information generalized over a range of object positions and scales, even for novel objects. Coarse information about position and scale could also be read out from the same population.

}, keywords = {Action Potentials, Animals, Brain Mapping, Macaca mulatta, Neurons, Psychology, Psychomotor Performance, Recognition, Temporal Lobe, Time Factors, Visual Perception}, issn = {0036-8075}, doi = {10.1126/science.1117593}, url = {https://www.sciencemag.org/lookup/doi/10.1126/science.1117593}, author = {Hung, Chou P. and Kreiman, Gabriel and Poggio, Tomaso and DiCarlo, James J.} } @article {87, title = {Multiple Object Response Normalization in Monkey Inferotemporal Cortex}, journal = {Journal of Neuroscience}, volume = {25}, year = {2005}, month = {07/2005}, pages = {8150 - 8164}, abstract = {

The highest stages of the visual ventral pathway are commonly assumed to provide robust representation of object identity by disregarding confounding factors such as object position, size, illumination, and the presence of other objects (clutter). However, whereas neuronal responses in monkey inferotemporal cortex (IT) can show robust tolerance to position and size changes, previous work shows that responses to preferred objects are usually reduced by the presence of nonpreferred objects. More broadly, we do not yet understand multiple object representation in IT. In this study, we systematically examined IT responses to pairs and triplets of objects in three passively viewing monkeys across a broad range of object effectiveness. We found that, at least under these limited clutter conditions, a large fraction of the response of each IT neuron to multiple objects is reliably predicted as the average of its responses to the constituent objects in isolation. That is, multiple object responses depend primarily on the relative effectiveness of the constituent objects, regardless of object identity. This average effect becomes virtually perfect when populations of IT neurons are pooled. Furthermore, the average effect cannot simply be explained by attentional shifts but behaves as a primarily feedforward response property. Together, our observations are most consistent with mechanistic models in which IT neuronal outputs are normalized by summed synaptic drive into IT or spiking activity within IT and suggest that normalization mechanisms previously revealed at earlier visual areas are operating throughout the ventral visual stream.
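The "average effect" reported here is easy to state quantitatively: the response to a pair is predicted as the mean of the responses to the constituent objects in isolation. A sketch with simulated neurons (the data are generated to obey the averaging rule, so this only illustrates the model comparison, not the evidence):

    import numpy as np

    rng = np.random.default_rng(3)
    n = 200
    r_p = rng.gamma(2.0, 10.0, n)  # hypothetical responses to object P alone
    r_n = rng.gamma(2.0, 2.0, n)   # hypothetical responses to object N alone
    r_pair = 0.5 * (r_p + r_n) + rng.normal(scale=1.0, size=n)  # simulated pair

    for name, pred in (("average", 0.5 * (r_p + r_n)), ("sum", r_p + r_n)):
        mse = np.mean((pred - r_pair) ** 2)
        print(f"{name:7s} model, mean squared error: {mse:8.1f}")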

}, keywords = {Animals, Brain Mapping, Macaca mulatta, Male, Photic Stimulation, Posture, Psychology, Recognition, Temporal Lobe, Visual Pathways, Visual Perception}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.2058-05.2005}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.2058-05.2005}, author = {Zoccolan, D. and Cox, David D. and DiCarlo, James J.} } @proceedings {92, title = {Multiple object response normalization in monkey inferotemporal cortex}, journal = {Society for Neuroscience}, year = {2005}, month = {11/2005}, publisher = {SFN}, address = {Washington, DC, USA}, author = {Zoccolan, Davide and Cox, David D. and DiCarlo, James J.} } @article {93, title = {Ultra-fast object recognition from few spikes}, number = {2005-022}, year = {2005}, month = {07/2005}, pages = {1-31}, institution = {MIT}, address = {Cambridge, MA}, abstract = {

Understanding the complex brain computations leading to object recognition requires quantitatively characterizing the information represented in inferior temporal cortex (IT), the highest stage of the primate visual stream. A read-out technique based on a trainable classifier is used to characterize the neural coding of selectivity and invariance at the population level. The activity of very small populations of independently recorded IT neurons (~100 randomly selected cells) over very short time intervals (as small as 12.5 ms) contains surprisingly accurate and robust information about both object 'identity' and 'category', which is furthermore highly invariant to object position and scale. Significantly, selectivity and invariance are present even for novel objects, indicating that these properties arise from the intrinsic circuitry and do not require object-specific learning. Within the limits of the technique, there is no detectable difference in the latency or temporal resolution of the IT information supporting so-called 'categorization' (a.k.a. basic level) and 'identification' (a.k.a. subordinate level) tasks. Furthermore, other information, in particular information about stimulus location and scale, can also be read out from the same small population of IT neurons. These results show how it is possible to decode invariant object information rapidly, accurately and robustly from a small population in IT and provide insights into the nature of the neural code for different kinds of object-related information.

}, keywords = {AI, inferior temporal cortex, neural coding, object recognition}, issn = {2005-022}, url = {https://dspace.mit.edu/handle/1721.1/30556}, author = {Hung, Chou P. and Kreiman, Gabriel and Poggio, Tomaso and DiCarlo, James J.} } @proceedings {94, title = {Using {\textquoteleft}read-out{\textquoteright} of object identity to understand object coding in the macaque anterior inferior temporal cortex}, journal = {Computational and Systems Neuroscience (COSYNE)}, year = {2005}, month = {03/2005}, publisher = {COSYNE}, address = {Salt Lake City, Utah, USA}, abstract = {

Recent efforts to develop robust computer vision systems capable of approaching the level of human object recognition performance have shown that models based on neurobiology can outperform non-biological models. To continue the development of such systems, we must understand the codes used by neuronal ensembles to represent object identity--a matter of continued debate in the neuroscience community. Understanding such codes would also allow the development of brain-machine interfaces capable of 'reading-out' or 'writing-in' information to AIT. One approach to understanding these neuronal codes is to obtain a large sample of neuronal data using a fixed set of objects, and then test the ability of various neuronal population measures (codes) to convey knowledge of object identity -- that is, determine the ability of each putative neuronal code to 'read out' object identity information from AIT. To this end, we have now recorded single-unit activity (SUA), multiunit activity (MUA) and local field potentials (LFP) from over 500 sites in the anterior inferior temporal cortex (AIT) of two macaque monkeys while they viewed a fixed set of 77 complex objects. In this paper, we focus on the temporal latency and resolution of AIT neuronal population codes that best convey knowledge of object identity as assessed by object classification. Specifically, the 77 objects were divided before the experiment into eight different classes: toys, foodstuffs, human faces, monkey faces, hands, vehicles, boxes and cats/dogs. To evaluate the performance of each putative neuronal population code, we used support vector machines (SVM) and jackknife cross-validation to avoid over-fitting. Under the assumption of no covariance across spatially separate sites, these data and methods allowed us to examine putative neuronal population codes that differed in their latency (0 to 100 ms from stimulus onset) and their temporal resolution (12.5 to 200 ms wide bins). Results were evaluated for consistency across different sets of recording sites. We found that MUA and SUA spike count codes could perform single trial object classification. Performance improved with the number of sites and reached ~90\% when ~128 arbitrary AIT sites were included. For any given number of sites, increasing the temporal resolution beyond ~50 ms did not lead to improved performance and the optimal latency from stimulus onset was ~100 ms. Because the neuronal data were not collected simultaneously, we cannot rule out spatio-temporal codes with a time scale less than ~50 ms. However, these results support the hypothesis that the time scale of the object identity code in AIT is ~50 ms and that downstream read-out of object identity for behavior or further processing may simply integrate over this time scale.
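
The latency/resolution sweep described above can be sketched as follows (synthetic 1 ms rasters; an SVM with k-fold rather than jackknife cross-validation for brevity; all numbers illustrative):

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(2)
n_sites, n_classes, n_trials, n_ms = 64, 8, 20, 300

# Synthetic 1 ms rasters: class-specific firing appears ~100 ms after onset.
counts = rng.binomial(1, 0.02, size=(n_classes * n_trials, n_sites, n_ms))
labels = np.repeat(np.arange(n_classes), n_trials)
tuning = rng.uniform(0.0, 0.08, size=(n_classes, n_sites))
for tr, lab in enumerate(labels):
    counts[tr, :, 100:] += rng.binomial(
        1, tuning[lab][:, None], size=(n_sites, n_ms - 100))

# Sweep code latency and temporal resolution (bin width), both in ms.
for latency in (0, 50, 100):
    for width in (12, 50, 200):
        X = counts[:, :, latency:latency + width].sum(axis=2)  # spike counts
        acc = cross_val_score(LinearSVC(), X, labels, cv=5).mean()
        print(latency, width, round(float(acc), 2))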

}, author = {Hung, Chou P. and Kreiman, Gabriel and Quiroga, Rodrigo Quian and Kraskov, Alexander and Poggio, Tomaso and DiCarlo, James J.} } @proceedings {95, title = {The effect of visual experience on the position tolerance of primate object representations}, journal = {Society for Neuroscience}, year = {2004}, month = {10/2004}, publisher = {SFN}, address = {San Diego, CA, USA}, author = {Cox, David D. and DiCarlo, James J.} } @proceedings {97, title = {Mapping functional neuronal processing chains underlying sensory-motor tasks in the primate}, journal = {Gordon Conference: Sensory coding and the natural environment}, year = {2004}, month = {2004}, address = {Oxford, UK}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @proceedings {96, title = {Object recognition by selective spike and LFP data in macaque inferior temporal cortex}, journal = {Society for Neuroscience}, year = {2004}, month = {10/2004}, publisher = {SFN}, address = {San Diego, CA, USA}, author = {Kreiman, Gabriel and Hung, Chou P. and Poggio, Tomaso and DiCarlo, James J.} } @article {98, title = {Selectivity of local field potentials in macaque inferior temporal cortex}, number = {2004-20}, year = {2004}, month = {09/2004}, institution = {MIT}, address = {Cambridge, MA}, abstract = {

While single neurons in inferior temporal (IT) cortex show differential responses to distinct complex stimuli, little is known about the responses of populations of neurons in IT. We recorded single-electrode data, including multi-unit activity (MUA) and local field potentials (LFP), from 618 sites in the inferior temporal cortex of macaque monkeys while the animals passively viewed 78 different pictures of complex stimuli. The LFPs were obtained by low-pass filtering the extracellular electrophysiological signal with a corner frequency of 300 Hz. As reported previously, we observed that spike counts from MUA showed selectivity for some of the pictures. Strikingly, the LFP data, which are thought to constitute an average over large numbers of neurons, also showed significantly selective responses. The LFP responses were less selective than the MUA responses both in terms of the proportion of selective sites and in the selectivity of each site. We observed that there was little overlap between the selectivity of MUA and LFP recordings from the same electrode. To assess the spatial organization of selective responses, we compared the selectivity of nearby sites recorded along the same penetration and sites recorded from different penetrations. We observed that MUA selectivity was correlated on spatial scales up to 800 μm, while the LFP selectivity was correlated over a larger spatial extent, with significant correlations between sites separated by several mm. Our data support the idea that there is some topographical arrangement to the organization of selectivity in inferior temporal cortex and that this organization may be relevant for the representation of object identity in IT.
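
The LFP extraction step is just a low-pass filter; a sketch with scipy (the sampling rate and filter order are my illustrative choices, not the paper's):

import numpy as np
from scipy.signal import butter, filtfilt

fs = 20000.0  # raw sampling rate (Hz); illustrative
rng = np.random.default_rng(0)
raw = rng.standard_normal(int(fs))  # 1 s of synthetic broadband signal

# Low-pass with a 300 Hz corner isolates the LFP band, as described above;
# the complementary high-pass band would be used for spikes/MUA.
b, a = butter(4, 300.0 / (fs / 2.0), btype="low")
lfp = filtfilt(b, a, raw)
print(lfp.shape)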

}, keywords = {AI, inferior temporal cortex, local field potentials, object recognition}, issn = {2004-020}, url = {https://dspace.mit.edu/handle/1721.1/30417}, author = {Kreiman, Gabriel and Hung, Chou P. and Poggio, Tomaso and DiCarlo, James J.} } @article {89, title = {Using Neuronal Latency to Determine Sensory{\textendash}Motor Processing Pathways in Reaction Time Tasks}, journal = {Journal of Neurophysiology}, volume = {93}, year = {2004}, month = {11/2004}, pages = {2974 - 2986}, abstract = {

We describe a new technique that uses the timing of neuronal and behavioral responses to explore the contributions of individual neurons to specific behaviors. The approach uses both the mean neuronal latency and the trial-by-trial covariance between neuronal latency and behavioral response. Reliable measurements of these values were obtained from single-unit recordings made from anterior inferotemporal (AIT) cortex and the frontal eye fields (FEF) in monkeys while they performed a choice reaction time task. These neurophysiological data show that the responses of AIT neurons and some FEF neurons have little covariance with behavioral response, consistent with a largely \"sensory\" response. The responses of another group of FEF neurons with longer mean latency covary tightly with behavioral response, consistent with a largely \"motor\" response. A very small fraction of FEF neurons had responses consistent with an intermediate position in the sensory-motor pathway. These results suggest that this technique is a valuable tool for exploring the functional organization of neuronal circuits that underlie specific behaviors.
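
The logic of the technique can be sketched with synthetic trials: a "sensory" neuron's response latency is locked to the stimulus and barely covaries with reaction time, while a "motor" neuron's latency is locked to the movement (all numbers illustrative):

import numpy as np

rng = np.random.default_rng(3)
n_trials = 200
rt = rng.normal(250.0, 30.0, size=n_trials)  # behavioral reaction times (ms)

# "Sensory": latency locked to stimulus onset, independent of RT.
lat_sensory = rng.normal(100.0, 10.0, size=n_trials)
# "Motor": latency locked to the movement, hence covarying with RT.
lat_motor = rt - rng.normal(50.0, 10.0, size=n_trials)

for name, lat in (("sensory", lat_sensory), ("motor", lat_motor)):
    print(name, "latency-RT correlation:",
          round(float(np.corrcoef(lat, rt)[0, 1]), 2))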

}, keywords = {Action Potentials, Afferent, Animal, Animals, Behavior, Macaca mulatta, Male, Models, Motor Neurons, Neural Pathways, Neurological, Neurons, Photic Stimulation, Psychomotor Performance, Reaction Time, Task Performance and Analysis, Temporal Lobe, Time Factors, Visual Fields}, issn = {0022-3077}, doi = {10.1152/jn.00508.2004}, url = {https://www.physiology.org/doi/10.1152/jn.00508.2004}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @article {99, title = {Anterior Inferotemporal Neurons of Monkeys Engaged in Object Recognition Can be Highly Sensitive to Object Retinal Position}, journal = {Journal of Neurophysiology}, volume = {89}, year = {2003}, month = {01/2003}, pages = {3264 - 3278}, abstract = {

Visual object recognition is computationally difficult because changes in an object's position, distance, pose, or setting may cause it to produce a different retinal image on each encounter. To robustly recognize objects, the primate brain must have mechanisms to compensate for these variations. Although these mechanisms are poorly understood, it is thought that they elaborate neuronal representations in the inferotemporal cortex that are sensitive to object form but substantially invariant to other image variations. This study examines this hypothesis for image variation resulting from changes in object position. We studied the effect of small differences (+/-1.5 degrees) in the retinal position of small (0.6 degrees wide) visual forms on both the behavior of monkeys trained to identify those forms and the responses of 146 anterior IT (AIT) neurons collected during that behavior. Behavioral accuracy and speed were largely unaffected by these small changes in position. Consistent with previous studies, many AIT responses were highly selective for the forms. However, AIT responses showed far greater sensitivity to retinal position than predicted from their reported receptive field (RF) sizes. The median AIT neuron showed an approximately 60\% response decrease between positions within +/-1.5 degrees of the center of gaze, and 52\% of neurons were unresponsive to one or more of these positions. Consistent with previous studies, each neuron's rank order of target preferences was largely unaffected across position changes. Although we have not yet determined the conditions necessary to observe this marked position sensitivity in AIT responses, we rule out effects of spatial-frequency content, eye movements, and failures to include the RF center. To reconcile this observation with previous studies, we hypothesize that either AIT position sensitivity strongly depends on object size or that position sensitivity is sharpened by extensive visual experience at fixed retinal positions or by the presence of flanking distractors.
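
The preserved-rank-order observation corresponds to a high Spearman correlation between a neuron's form preferences measured at different positions; a synthetic sketch (gains and noise levels are illustrative):

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(4)
n_forms, n_positions = 5, 3

# Shared form preferences scaled by a position-dependent gain: strong
# position sensitivity with preserved rank order, as described above.
pref = np.sort(rng.gamma(2.0, 10.0, size=n_forms))[::-1]
gain = np.array([1.0, 0.4, 0.1])
resp = pref[None, :] * gain[:, None] + rng.normal(0.0, 0.5, (n_positions, n_forms))

rho, _ = spearmanr(resp[0], resp[2])
print("rank-order correlation between extreme positions:", round(float(rho), 2))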

}, keywords = {Action Potentials, Animals, Depth Perception, Electrophysiology, Eye Movements, Form Perception, Macaca mulatta, Male, Neurons, Pattern Recognition, Photic Stimulation, Psychomotor Performance, Retina, Temporal Lobe, Time Factors, Visual, Visual Fields, Visual Perception}, issn = {0022-3077}, doi = {10.1152/jn.00358.2002}, url = {https://www.physiology.org/doi/10.1152/jn.00358.2002}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @article {100, title = {Receptive field structure in cortical area 3b of the alert monkey}, journal = {Behavioural Brain Research}, volume = {135}, year = {2002}, month = {01/2002}, pages = {167 - 178}, abstract = {

More than 350 neurons with fingerpad receptive fields (RFs) were studied in cortical area 3b of three alert monkeys. Random dot patterns, which contain all stimulus patterns with equal probability, were scanned across these RFs at three velocities and eight directions to reveal the RFs' spatial and temporal structure. Area 3b RFs are characterized by three components: (1) a single, central excitatory region of short duration, (2) one or more inhibitory regions, also of short duration, that are adjacent to and nearly synchronous with the excitation, and (3) a region of inhibition that overlaps the excitation partially or totally and is temporally delayed with respect to the first two components. As a result of these properties, RF spatial structure depends on scanning direction but is virtually unaffected by changes in scanning velocity. This RF characterization, which is derived solely from responses to scanned random-dot patterns, predicts a neuron's responses to random patterns accurately, as expected, but it also predicts orientation sensitivity and preferred orientation measured with a scanned bar. Both orientation sensitivity and the ratio of coincident inhibition (number 2 above) to excitation are stronger in the supra- and infragranular layers than in layer IV.
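
A sketch of the three-component RF as a space-time kernel (all spatial scales, amplitudes, and time constants here are illustrative choices, not fitted values):

import numpy as np

x = np.linspace(-5.0, 5.0, 41)   # position across the fingerpad (mm)
t = np.arange(0.0, 60.0, 2.0)    # time after skin contact (ms)
X, T = np.meshgrid(x, t)

def gauss(center, width):
    return np.exp(-0.5 * ((X - center) / width) ** 2)

fast = np.exp(-T / 10.0)               # brief, early time course
slow = (T / 15.0) * np.exp(-T / 15.0)  # delayed time course

rf = (1.0 * gauss(0.0, 1.0) * fast     # (1) central excitation
      - 0.6 * gauss(2.5, 1.2) * fast   # (2) adjacent, synchronous inhibition
      - 0.4 * gauss(0.0, 1.5) * slow)  # (3) overlapping, delayed inhibition
print(rf.shape)  # (time, space)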

}, keywords = {Action Potentials, Afferent, Animals, Brain Mapping, Evoked Potentials, Haplorhini, Models, Neurological, Neurons, Orientation, Reproducibility of Results, Skin, Somatosensory, Somatosensory Cortex}, issn = {01664328}, doi = {10.1016/S0166-4328(02)00162-6}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0166432802001626}, author = {DiCarlo, James J and Johnson, Kenneth O} } @proceedings {101, title = {Using reaction time tasks to map sensory-motor chains in the monkey}, journal = {Society for Neuroscience}, year = {2002}, month = {10/2002}, publisher = {SFN}, address = {Orlando, FL, USA}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @article {105, title = {Form representation in monkey inferotemporal cortex is virtually unaltered by free viewing}, journal = {Nature Neuroscience}, volume = {3}, year = {2000}, month = {01/2000}, pages = {814 - 821}, abstract = {

How are objects represented in the brain during natural behavior? Visual object recognition in primates is thought to depend on the inferotemporal cortex (IT). In most neurophysiological studies of IT, monkeys hold their direction of gaze fixed while isolated visual stimuli are presented (controlled viewing). However, during natural behavior, primates visually explore cluttered environments by changing gaze direction several times each second (free viewing). We examined the effect of free viewing on IT neuronal responses in monkeys engaged in a form-recognition task. By making small, real-time stimulus adjustments, we produced nearly identical retinal stimulation during controlled and free viewing. Nearly 90\% of neuronal responses were unaffected by free viewing, and average stimulus selectivity was unchanged. Thus, neuronal representations that likely underlie form recognition are virtually unaltered by free viewing.

}, keywords = {Animals, Conditioning, Fixation, Form Perception, Macaca mulatta, Male, Neurons, Ocular, Pattern Recognition, Photic Stimulation, Psychology, Saccades, Temporal Lobe, Visual, Visual Cortex}, issn = {1097-6256}, doi = {10.1038/77722}, url = {http://www.nature.com/articles/nn0800_814}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @proceedings {106, title = {Inferotemporal representations underlying object recognition in the free viewing monkey}, journal = {Society for Neuroscience}, year = {2000}, month = {10/2000}, publisher = {SFN}, address = {New Orleans, LA, USA}, author = {DiCarlo, James J. and Maunsell, John H. R.} } @article {103, title = {Spatial and Temporal Structure of Receptive Fields in Primate Somatosensory Area 3b: Effects of Stimulus Scanning Direction and Orientation}, journal = {The Journal of Neuroscience}, volume = {20}, year = {2000}, month = {01/2000}, pages = {495 - 510}, abstract = {

This is the third in a series of studies of the neural representation of tactile spatial form in somatosensory cortical area 3b of the alert monkey. We previously studied the spatial structure of >350 fingerpad receptive fields (RFs) with random-dot patterns scanned in one direction (DiCarlo et al., 1998) and at varying velocities (DiCarlo and Johnson, 1999). Those studies showed that area 3b RFs have a wide range of spatial structures that are virtually unaffected by changes in scanning velocity. In this study, 62 area 3b neurons were studied with three to eight scanning directions (58 with four or more directions). The data from all three studies are described accurately by an RF model with three components: (1) a single, central excitatory region of short duration, (2) one or more inhibitory regions, also of short duration, that are adjacent to and nearly synchronous with the excitation, and (3) a region of inhibition that overlaps the excitation partially or totally and is temporally delayed with respect to the first two components. The mean correlation between the observed RFs and the RFs predicted by this three-component model was 0.81. The three-component RFs also predicted orientation sensitivity and preferred orientation to a scanned bar accurately. The orientation sensitivity was determined most strongly by the intensity of the coincident RF inhibition in relation to the excitation. Both orientation sensitivity and this ratio were stronger in the supragranular and infragranular layers than in layer IV.
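
Predicting orientation sensitivity from an RF estimate amounts to summing the RF under bars of different orientations; a 2-D sketch (the RF shape here is illustrative, not a fitted area 3b RF):

import numpy as np

yy, xx = np.mgrid[-10:11, -10:11]

# Illustrative spatial RF: central excitation with one flanking inhibitory lobe.
rf = np.exp(-(xx**2 + yy**2) / 8.0) - 0.6 * np.exp(-((xx - 4)**2 + yy**2) / 10.0)

# Predicted response to a thin bar through the RF center at each orientation.
for deg in range(0, 180, 30):
    th = np.deg2rad(deg)
    bar = np.abs(xx * np.sin(th) - yy * np.cos(th)) < 1.0
    print(deg, round(float(rf[bar].sum()), 2))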

}, keywords = {Action Potentials, Animals, Discrimination Learning, Fingers, Macaca mulatta, Movement, Normal Distribution, Reaction Time, Somatosensory Cortex, Space Perception, Time Factors, Touch}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.20-01-00495.2000}, url = {http://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.20-01-00495.2000}, author = {DiCarlo, James J. and Johnson, Kenneth O.} } @proceedings {108, title = {Form processing in area 3b}, journal = {International Symposium on Brain Mechanisms of Tactile Perception}, year = {1999}, month = {1999}, address = {Stockholm, Sweden}, author = {DiCarlo, James J. and Johnson, Kenneth O} } @article {107, title = {Velocity Invariance of Receptive Field Structure in Somatosensory Cortical Area 3b of the Alert Monkey}, journal = {The Journal of Neuroscience}, volume = {19}, year = {1999}, month = {01/1999}, pages = {401 - 419}, abstract = {

This is the second in a series of studies of the neural representation of tactile spatial form in cortical area 3b of the alert monkey. We previously studied the spatial structure of 330 area 3b neuronal receptive fields (RFs) on the fingerpad with random dot patterns scanned at one velocity (40 mm/sec; DiCarlo et al., 1998). Here, we analyze the temporal structure of 84 neuronal RFs by studying their spatial structure at three scanning velocities (20, 40, and 80 mm/sec). As in the previous study, most RFs contained a single, central, excitatory region and one or more surrounding or flanking inhibitory regions. The mean time delay between skin stimulation and its excitatory effect was 15.5 msec. Except for differences in mean rate, each neuron's response and the spatial structure of its RF were essentially unaffected by scanning velocity. This is the expected outcome when excitatory and inhibitory effects are brief and synchronous. However, that interpretation is consistent neither with the reported timing of excitation and inhibition in somatosensory cortex nor with the third study in this series, which investigates the effect of scanning direction and shows that one component of inhibition lags behind excitation. We reconcile these observations by showing that overlapping (in-field) inhibition delayed relative to excitation can produce RF spatial structure that is unaffected by changes in scanning velocity. Regardless of the mechanisms, the velocity invariance of area 3b RF structure is consistent with the velocity invariance of tactile spatial perception (e.g., roughness estimation and form recognition).

}, keywords = {Adaptation, Animals, Brain Mapping, Cortical Synchronization, Evoked Potentials, Female, Macaca mulatta, Male, Neural Inhibition, Physiological, Somatosensory Cortex, Visual Fields}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.19-01-00401.1999}, url = {http://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.19-01-00401.1999}, author = {DiCarlo, James J. and Johnson, Kenneth O.} } @article {110, title = {Structure of Receptive Fields in Area 3b of Primary Somatosensory Cortex in the Alert Monkey}, journal = {The Journal of Neuroscience}, volume = {18}, year = {1998}, month = {04/1998}, pages = {2626 - 2645}, abstract = {

We investigated the two-dimensional structure of area 3b neuronal receptive fields (RFs) in three alert monkeys. Three hundred thirty neurons with RFs on the distal fingerpads were studied with scanned, random dot stimuli. Each neuron was stimulated continuously for 14 min, yielding 20,000 response data points. Excitatory and inhibitory components of each RF were determined with a modified linear regression algorithm. Analyses assessing goodness-of-fit, repeatability, and generality of the RFs were developed. Two hundred forty-seven neurons yielded highly repeatable RF estimates, and most RFs accounted for a large fraction of the explainable response of each neuron. Although the area 3b RF structures appeared to be continuously distributed, certain structural generalities were apparent. Most RFs (94\%) contained a single, central region of excitation and one or more regions of inhibition located on one, two, three, or all four sides of the excitatory center. The shape, area, and strength of excitatory and inhibitory RF regions ranged widely. Half the RFs contained almost evenly balanced excitation and inhibition. The findings indicate that area 3b neurons act as local spatiotemporal filters that are maximally excited by the presence of particular stimulus features. We believe that form and texture perception are based on high-level representations and that area 3b is an intermediate stage in the processes leading to these representations. Two possibilities are considered: (1) that these high-level representations are basically somatotopic and that area 3b neurons amplify some features and suppress others, or (2) that these representations are highly transformed and that area 3b effects a step in the transformation.
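
A sketch of the regression step: regress spike counts on the dot pattern to recover the RF. Ridge regularization stands in here for the paper's modified algorithm; the data are synthetic and all numbers are illustrative:

import numpy as np

rng = np.random.default_rng(6)
n_samples, n_pixels = 20000, 100

# Random-dot stimulus frames and a sparse "true" RF generating the responses.
S = rng.binomial(1, 0.1, size=(n_samples, n_pixels)).astype(float)
true_rf = rng.normal(0.0, 1.0, n_pixels) * (rng.random(n_pixels) < 0.2)
resp = S @ true_rf + rng.normal(0.0, 1.0, n_samples)

# Ridge-regularized least squares: rf_hat = (S'S + lam*I)^(-1) S'resp.
lam = 10.0
rf_hat = np.linalg.solve(S.T @ S + lam * np.eye(n_pixels), S.T @ resp)
print("correlation with true RF:",
      round(float(np.corrcoef(rf_hat, true_rf)[0, 1]), 2))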

}, keywords = {Afferent, Animals, Data Interpretation, Electrophysiology, Female, Macaca mulatta, Male, Neural Inhibition, Neurons, Reproducibility of Results, Somatosensory Cortex, Statistical, Touch}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.18-07-02626.1998}, url = {http://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.18-07-02626.1998}, author = {DiCarlo, James J. and Johnson, Kenneth O. and Hsiao, Steven S.} } @proceedings {112, title = {Spatial and temporal properties of neural receptive fields in area 3b of the awake monkey}, journal = {Society for Neuroscience}, year = {1997}, month = {10/1997}, publisher = {SFN}, address = {New Orleans, LA, USA}, author = {DiCarlo, James J. and Hsiao, Steven S. and Johnson, Kenneth O} } @inbook {113, title = {Form processing and attention effects in somatosensory cortex}, booktitle = {Somesthesis and the Neurobiology of the Somatosensory Cortex}, year = {1996}, publisher = {Birkhauser Basel}, organization = {Birkhauser Basel}, address = {Switzerland}, issn = {978-3-0348-9016-8}, author = {Hsiao, Steven S. and Johnson, Kenneth O and Twombly, IA and DiCarlo, James J.}, editor = {Franzen, O and Johansson, R and Terenius, L} } @proceedings {114, title = {Laminar differences in spatiotemporal receptive field structure of neurons in area 3b of the awake macaque}, journal = {Society for Neuroscience}, year = {1996}, month = {10/1996}, publisher = {SFN}, address = {Washington, DC, USA}, author = {DiCarlo, James J. and Twombly, IA and Hsiao, Steven S. and Johnson, Kenneth O} } @proceedings {115, title = {Linear and non-linear processing of tactile spatial form in area 3b of the awake macaque}, journal = {Society for Neuroscience}, year = {1996}, month = {10/1996}, publisher = {SFN}, address = {Washington, DC, USA}, author = {Twombly, IA and DiCarlo, James J. and Hsiao, Steven S. and Johnson, Kenneth O} } @article {116, title = {Marking microelectrode penetrations with fluorescent dyes}, journal = {Journal of Neuroscience Methods}, volume = {64}, year = {1996}, month = {01/1996}, pages = {75 - 81}, abstract = {

Fluorescent dyes were used to mark and identify the tracks left by extracellular microelectrodes in neurophysiological experiments. Forty-two penetrations were made into the postcentral gyrus of 3 macaque monkeys with electrodes coated with 1 of 5 fluorescent dyes (DiI, DiO, DiI-C5, PyPO, and Fast Blue). The electrodes were driven at rates ranging from 10 to 1000 μm/min, to a depth of about 4000 μm, where a small electrolytic lesion was made. Histological sections were viewed under fluorescent optics and the electrode tracks were reconstructed from the dye traces. Fluorescent traces (width 50-400 μm) were observed in 41 of 42 penetrations, with 24 traces extending to the lesion site. Of the electrodes driven in less than 3 h, those coated with DiI (8/8) and DiI-C5 (8/8) left a trace to the lesion site, while 57\% (4/7) of the DiO, 40\% (2/5) of the Fast Blue and only 11\% (1/9) of the PyPO tracks were fully marked.

This method of marking penetrations can be used with any extracellular recording configuration, does not require tissue sections to be processed or stained, does not require electrical lesions, and causes no detectable tissue damage. Because the dyes fluoresce at different wavelengths, closely spaced tracks can be uniquely identified.

}, keywords = {Animals, Brain, Electrophysiology, Fluorescent Dyes, Macaca mulatta, Microelectrodes, Neurosciences}, issn = {01650270}, doi = {10.1016/0165-0270(95)00113-1}, url = {https://linkinghub.elsevier.com/retrieve/pii/0165027095001131}, author = {DiCarlo, James J. and Lane, John W. and Hsiao, Steven S. and Johnson, Kenneth O.} } @proceedings {117, title = {Animals, Brain, Electrophysiology, Fluorescent Dyes, Macaca mulatta, Microelectrodes, Neurosciences}, journal = {Biomedical Engineering Society}, year = {1995}, month = {1995}, address = {Boston, MA}, author = {Hsiao, Steven S. and DiCarlo, James J. and Johnson, Kenneth O} } @proceedings {118, title = {Transformation of tactile spatial form within a cortical column in area 3b of the macaque}, journal = {Society for Neuroscience}, year = {1994}, month = {10/1994}, publisher = {SFN}, address = {Miami, FL}, author = {DiCarlo, James J. and Hsiao, Steven S. and Johnson, Kenneth O} } @article {119, title = {Stimulus configuration, classical conditioning, and hippocampal function.}, journal = {Psychological Review}, volume = {99}, year = {1992}, month = {04/1992}, pages = {268 - 305}, abstract = {

Hippocampal participation in classical conditioning is described in terms of a multilayer network that portrays stimulus configuration. The network (a) describes behavior in real time, (b) incorporates a layer of \"hidden\" units positioned between input and output units, (c) includes inputs that are connected to the output directly as well as indirectly through the hidden-unit layer, and (d) uses a biologically plausible backpropagation procedure to train the hidden-unit layer. Nodes and connections in the neural network are mapped onto regional cerebellar, cortical, and hippocampal circuits, and the effect of lesions of different brain regions is formally studied. Computer simulations of the following classical conditioning paradigms are presented: acquisition of delay and trace conditioning, extinction, acquisition-extinction series of delay conditioning, blocking, overshadowing, discrimination acquisition, discrimination reversal, feature-positive discrimination, conditioned inhibition, negative patterning, positive patterning, and generalization. The model correctly describes the effect of hippocampal and cortical lesions in many of these paradigms, as well as neural activity in hippocampus and medial septum during classical conditioning. Some of these results might be extended to the description of anterograde amnesia in human patients.
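
A minimal numpy sketch of the topology described above (direct input-to-output connections alongside a backprop-trained hidden layer), using negative patterning (respond to A or B alone, not to AB) as the task; the layer sizes, learning rate, and seed are illustrative:

import numpy as np

rng = np.random.default_rng(7)
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])  # A, B absent/present
y = np.array([0., 1., 1., 0.])  # negative patterning target

def sig(z):
    return 1.0 / (1.0 + np.exp(-z))

W1, b1 = rng.normal(0, 1, (2, 8)), np.zeros(8)  # input -> hidden ("configural")
W2, Wd, b2 = rng.normal(0, 1, 8), rng.normal(0, 1, 2), 0.0
lr = 1.0
for _ in range(10000):  # may need more iterations with another seed
    h = sig(X @ W1 + b1)
    out = sig(h @ W2 + X @ Wd + b2)      # direct plus hidden-mediated paths
    g = (out - y) * out * (1 - out)      # output delta (squared-error loss)
    dh = np.outer(g, W2) * h * (1 - h)   # backpropagated hidden deltas
    W2 -= lr * h.T @ g;  b2 -= lr * g.sum()
    Wd -= lr * X.T @ g
    W1 -= lr * X.T @ dh; b1 -= lr * dh.sum(axis=0)
print(np.round(out, 2))  # should approach [0, 1, 1, 0]

The direct path alone is linear in the inputs and cannot solve negative patterning, which is the role the model assigns to the hidden (configural) layer; removing it mimics a lesion of the corresponding circuitry in the model.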

}, keywords = {Animals, Association Learning, Brain Mapping, Cerebellum, Cerebral Cortex, Classical, Computer Simulation, Conditioning, Hippocampus, Humans, Models, Neural Pathways, Neurological}, issn = {0033-295X}, doi = {10.1037/0033-295X.99.2.268}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/0033-295X.99.2.268}, author = {Schmajuk, Nestor A. and DiCarlo, James J.} } @article {120, title = {A hippocampal theory of schizophrenia}, journal = {Behavioral and Brain Sciences}, volume = {14}, year = {1991}, month = {01/1991}, pages = {47-49}, issn = {0140-525X}, doi = {10.1017/S0140525X00065353}, url = {https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/article/hippocampal-theory-of-schizophrenia/D6F6C7FD873BCA595607226B4BD349B1}, author = {Schmajuk, Nestor A. and DiCarlo, James J.} } @inbook {121, title = {Neural dynamics of hippocampal modulation of classical conditioning}, booktitle = {Neural Network Models of Conditioning and Action}, year = {1991}, publisher = {Lawrence Erlbaum Association}, organization = {Lawrence Erlbaum Association}, address = {Hillsdale, NJ}, author = {Schmajuk, Nestor A. and DiCarlo, James J.}, editor = {Commons, M and Grossberg, S and Staddon, JER} } @article {122, title = {A neural network approach to hippocampal function in classical conditioning.}, journal = {Behavioral Neuroscience}, volume = {105}, year = {1991}, month = {01/1991}, pages = {82 - 110}, abstract = {

Hippocampal participation in classical conditioning is described in terms of S. Grossberg's (1975) attentional theory. According to the theory, pairing of a conditioned stimulus (CS) with an unconditioned stimulus (UCS) causes both an association of the sensory representation of the CS with the UCS (conditioned reinforcement learning) and an association of the sensory representation of the CS with the drive representation of the UCS (incentive motivation learning). Sensory representations compete for a limited-capacity short-term memory (STM). The STM regulation hypothesis, which proposes that the hippocampus controls incentive motivation, self-excitation, and competition among sensory representations, thereby regulating the contents of a limited-capacity STM, is introduced. Under the STM regulation hypothesis, nodes and connections in Grossberg's neural network are mapped onto regional hippocampal-cerebellar circuits. The resulting neural model provides (a) a framework for understanding the dynamics of information processing and storage in the hippocampus and cerebellum during classical conditioning of the rabbit's nictitating membrane, (b) principles for understanding the effect of different hippocampal manipulations on classical conditioning, and (c) novel and testable predictions.
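
A toy sketch of competition for a limited-capacity STM via shunting on-center, off-surround dynamics (the specific equations and constants are my illustrative choices in the spirit of Grossberg-style networks, not the paper's):

import numpy as np

I = np.array([0.1, 1.0, 0.4])  # bottom-up input to three CS representations
x = np.zeros(3)                # STM activities
A, B, dt = 1.0, 1.0, 0.005     # decay, capacity ceiling, integration step
for _ in range(4000):
    exc = I + x                # on-center drive: input plus self-excitation
    inh = exc.sum() - exc      # off-surround: competition from the others
    x = x + dt * (-A * x + (B - x) * exc - x * inh)
# Activities stay bounded by B; stronger inputs claim a larger share of STM.
print(np.round(x, 2))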

}, keywords = {Animals, Cerebellum, Classical, Computer Simulation, Conditioning, Extinction, Eyelid, Hippocampus, Models, Nerve Net, Neurological, Neurons, Psychological, Rabbits, Reaction Time}, issn = {0735-7044}, doi = {10.1037/0735-7044.105.1.82}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/0735-7044.105.1.82}, author = {Schmajuk, Nestor A. and DiCarlo, James J.} } @proceedings {123, title = {The short-term memory regulation hypothesis of hippocampal function}, journal = {Midwestern Psychology Association}, year = {1990}, month = {1990}, address = {Chicago, IL, USA}, author = {Schmajuk, Nestor A. and DiCarlo, James J.} } @proceedings {124, title = {Neural dynamics of hippocampal modulation of classical conditioning}, journal = {12th Symposium on Models of Behavior: Neural Network Models of Conditioning and Action}, year = {1989}, month = {1989}, address = {Cambridge, MA}, author = {Schmajuk, Nestor A. and DiCarlo, James J.} }