@conference {173, title = {Primate Inferotemporal Cortex Neurons Generalize Better to Novel Image Distributions Than Analogous Deep Neural Networks Units}, booktitle = {SVHRM Workshop at Neural Information Processing Systems (NeurIPS)}, year = {2022}, month = {2022 }, address = {Lisbon, Portugal}, abstract = {

Humans successfully recognize objects in a variety of image distributions. Today's artificial neural networks (ANNs), on the other hand, struggle to recognize objects in many image domains, especially those different from the training distribution. It is currently unclear which parts of the ANNs could be improved in order to close this generalization gap. In this work, we used recordings from primate high-level visual cortex (IT) to isolate whether ANNs lag behind primate generalization capabilities because of their encoder (transformations up to the penultimate layer) or their decoder (linear transformation into class labels). Specifically, we fit a linear decoder on images from one domain and evaluate transfer performance on twelve held-out domains, comparing fitting on primate IT representations vs. representations in ANN penultimate layers. To compare fairly, we scale the number of each ANN's units so that its in-domain performance matches that of the sampled IT population (i.e., 71 IT neural sites, 73\% binary-choice accuracy). We find that the sampled primate population achieves, on average, 68\% performance on the held-out domains. Comparably sampled populations of ANN model units generalize less well, maintaining on average 60\%. This is independent of the number of sampled units: models' out-of-domain accuracies consistently lag behind primate IT. These results suggest that making ANN model units more like primate IT will improve the generalization performance of ANNs.

}, url = {https://openreview.net/pdf?id=iPF7mhoWkOl}, author = {Bagus, Ayu Marliawaty I Gusti and Marques, Tiago and Sanghavi, Sachi and DiCarlo, James J and Schrimpf, Martin} } @conference {146, title = {Wiring Up Vision: Minimizing Supervised Synaptic Updates Needed to Produce a Primate Ventral Stream}, booktitle = {International Conference on Learning Representations 2022 Spotlight}, year = {2022}, month = {April 25, 2022}, type = {preprint}, abstract = {

After training on large datasets, certain deep neural networks are surprisingly good models of the neural mechanisms of adult primate visual object recognition. Nevertheless, these models are considered poor models of the development of the visual system because they posit millions of sequential, precisely coordinated synaptic updates, each based on a labeled image. While ongoing research is pursuing the use of unsupervised proxies for labels, we here explore a complementary strategy of reducing the required number of supervised synaptic updates to produce an adult-like ventral visual stream (as judged by the match to V1, V2, V4, IT, and behavior). Such models might require less precise machinery and energy expenditure to coordinate these updates and would thus move us closer to viable neuroscientific hypotheses about how the visual system wires itself up. Relative to standard model training on labeled images in ImageNet, we here demonstrate that the total number of supervised weight updates can be substantially reduced using three complementary strategies: First, we find that only 2\% of supervised updates (epochs and images) are needed to achieve $\sim$80\% of a fully trained model's match to the adult ventral stream. Specifically, training benefits predictions of higher visual cortex the most, whereas predictions of earlier areas improve only marginally over the course of training. Second, by improving the random distribution of synaptic connectivity, we find that 54\% of the brain match can already be achieved "at birth" (i.e., no training at all). Third, we find that, by training only $\sim$5\% of model synapses, we can still achieve nearly 80\% of the match to the ventral stream. This approach also improves ImageNet performance over previous attempts in computer vision to minimize trained components, without substantially increasing the number of trained parameters. These results reflect first steps in modeling not just primate adult visual processing during inference, but also how the ventral visual stream might be "wired up" by evolution (a model's "birth" state) and by developmental learning (a model's updates based on visual experience).

}, keywords = {biologically plausible learning, computational neuroscience, convolutional neural networks, primate visual ventral stream}, doi = {10.1101/2020.06.08.140111}, url = {https://openreview.net/pdf?id=g1SzIRLQXMM}, author = {Geiger, Franziska and Schrimpf, Martin and Marques, Tiago and DiCarlo, James J} } @proceedings {163, title = {Combining Different V1 Brain Model Variants to Improve Robustness to Image Corruptions in CNNs}, journal = {Shared Visual Representations in Human \& Machine Intelligence - NeurIPS Workshop}, year = {2021}, month = {October 20, 2021}, publisher = {Neural Information Processing Systems}, abstract = {

While some convolutional neural networks (CNNs) have surpassed human visual abilities in object classification, they often struggle to recognize objects in images corrupted with different types of common noise patterns, highlighting a major limitation of this family of models. Recently, it has been shown that simulating a primary visual cortex (V1) at the front of CNNs leads to small improvements in robustness to these image perturbations. In this study, we start with the observation that different variants of the V1 model show gains for specific corruption types. We then build a new model using an ensembling technique, which combines multiple individual models with different V1 front-end variants. The model ensemble leverages the strengths of each individual model, leading to significant improvements in robustness across all corruption categories and outperforming the base model by 38\% on average. Finally, we show that, using distillation, it is possible to partially compress the knowledge in the ensemble model into a single model with a V1 front-end. While the ensembling and distillation techniques used here are hardly biologically plausible, the results presented here demonstrate that by combining the specific strengths of different neuronal circuits in V1, it is possible to improve the robustness of CNNs to a wide range of perturbations.

}, url = {https://arxiv.org/abs/2110.10645}, author = {Baidya, Avinash and Dapello, Joel and DiCarlo, James J and Marques, Tiago} } @article {151, title = {Simulating a Primary Visual Cortex at the Front of CNNs Improves Robustness to Image Perturbations}, journal = {Neural Information Processing Systems (NeurIPS; spotlight)}, year = {2020}, month = {June 17, 2020}, type = {preprint}, abstract = {

Current state-of-the-art object recognition models are largely based on convolutional neural network (CNN) architectures, which are loosely inspired by the primate visual system. However, these CNNs can be fooled by imperceptibly small, explicitly crafted perturbations, and they struggle to recognize objects in corrupted images that are easily recognized by humans. Here, by making comparisons with primate neural data, we first observed that CNN models with a neural hidden layer that better matches primate primary visual cortex (V1) are also more robust to adversarial attacks. Inspired by this observation, we developed VOneNets, a new class of hybrid CNN vision models. Each VOneNet contains a fixed-weight neural network front-end that simulates primate V1, called the VOneBlock, followed by a neural network back-end adapted from current CNN vision models. The VOneBlock is based on a classical neuroscientific model of V1: the linear-nonlinear-Poisson model, consisting of a biologically constrained Gabor filter bank, simple and complex cell nonlinearities, and a V1 neuronal stochasticity generator. After training, VOneNets retain high ImageNet performance, but each is substantially more robust, outperforming the base CNNs and state-of-the-art methods by 18\% and 3\%, respectively, on a conglomerate benchmark of perturbations comprising white-box adversarial attacks and common image corruptions. Finally, we show that all components of the VOneBlock work in synergy to improve robustness. While current CNN architectures are arguably brain-inspired, the results presented here demonstrate that more precisely mimicking just one stage of the primate visual system leads to new gains in ImageNet-level computer vision applications.

}, doi = {10.1101/2020.06.16.154542}, url = {https://www.biorxiv.org/content/10.1101/2020.06.16.154542v27}, author = {Dapello, Joel and Marques, Tiago and Schrimpf, Martin and Geiger, Franziska and Cox, David D and DiCarlo, James J} }