@inproceedings{14326,
  abstract     = {Learning object-centric representations of complex scenes is a promising step towards enabling efficient abstract reasoning from low-level perceptual features. Yet, most deep learning approaches learn distributed representations that do not capture the compositional properties of natural scenes. In this paper, we present the Slot Attention module, an architectural component that interfaces with perceptual representations such as the output of a convolutional neural network and produces a set of task-dependent abstract representations which we call slots. These slots are exchangeable and can bind to any object in the input by specializing through a competitive procedure over multiple rounds of attention. We empirically demonstrate that Slot Attention can extract object-centric representations that enable generalization to unseen compositions when trained on unsupervised object discovery and supervised property prediction tasks.},
  author       = {Locatello, Francesco and Weissenborn, Dirk and Unterthiner, Thomas and Mahendran, Aravindh and Heigold, Georg and Uszkoreit, Jakob and Dosovitskiy, Alexey and Kipf, Thomas},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {11525--11538},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Object-centric learning with slot attention}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{15086,
  abstract     = {Many communication-efficient variants of SGD use gradient quantization schemes. These schemes are often heuristic and fixed over the course of training. We empirically observe that the statistics of gradients of deep models change during training. Motivated by this observation, we introduce two adaptive quantization schemes, ALQ and AMQ. In both schemes, processors update their compression schemes in parallel by efficiently computing sufficient statistics of a parametric distribution. We improve the validation accuracy by almost 2% on CIFAR-10 and 1% on ImageNet in challenging low-cost communication setups. Our adaptive methods are also significantly more robust to the choice of hyperparameters.},
  author       = {Faghri, Fartash and Tabrizian, Iman and Markov, Ilia and Alistarh, Dan-Adrian and Roy, Daniel and Ramezani-Kebrya, Ali},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Adaptive gradient quantization for data-parallel SGD}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{8188,
  abstract     = {A natural approach to generative modeling of videos is to represent them as a composition of moving objects. Recent works model a set of 2D sprites over a slowly-varying background, but without considering the underlying 3D scene that gives rise to them. We instead propose to model a video as the view seen while moving through a scene with multiple 3D objects and a 3D background. Our model is trained from monocular videos without any supervision, yet learns to generate coherent 3D scenes containing several moving objects. We conduct detailed experiments on two datasets, going beyond the visual complexity supported by state-of-the-art generative approaches. We evaluate our method on depth-prediction and 3D object detection---tasks which cannot be addressed by those earlier works---and show it outperforms them even on 2D instance segmentation and tracking.},
  author       = {Henderson, Paul M. and Lampert, Christoph},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {3106--3117},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Unsupervised object-centric video generation and decomposition in 3D}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{9631,
  abstract     = {The ability to leverage large-scale hardware parallelism has been one of the key enablers of the accelerated recent progress in machine learning. Consequently, there has been considerable effort invested into developing efficient parallel variants of classic machine learning algorithms. However, despite the wealth of knowledge on parallelization, some classic machine learning algorithms often prove hard to parallelize efficiently while maintaining convergence. In this paper, we focus on efficient parallel algorithms for the key machine learning task of inference on graphical models, in particular on the fundamental belief propagation algorithm. We address the challenge of efficiently parallelizing this classic paradigm by showing how to leverage scalable relaxed schedulers in this context. We present an extensive empirical study, showing that our approach outperforms previous parallel belief propagation implementations both in terms of scalability and in terms of wall-clock convergence time, on a range of practical applications.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {22361--22372},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Scalable belief propagation via relaxed scheduling}},
  volume       = {33},
  year         = {2020},
}

@inproceedings{9632,
  abstract     = {Second-order information, in the form of Hessian- or Inverse-Hessian-vector products, is a fundamental tool for solving optimization problems. Recently, there has been significant interest in utilizing this information in the context of deep neural networks; however, relatively little is known about the quality of existing approximations in this context. Our work examines this question, identifies issues with existing approaches, and proposes a method called WoodFisher to compute a faithful and efficient estimate of the inverse Hessian. Our main application is to neural network compression, where we build on the classic Optimal Brain Damage/Surgeon framework. We demonstrate that WoodFisher significantly outperforms popular state-of-the-art methods for one-shot pruning. Further, even when iterative, gradual pruning is allowed, our method results in a gain in test accuracy over the state-of-the-art approaches, for standard image classification datasets such as ImageNet ILSVRC. We examine how our method can be extended to take into account first-order information, as well as illustrate its ability to automatically set layer-wise pruning thresholds and perform compression in the limited-data regime. The code is available at the following link, https://github.com/IST-DASLab/WoodFisher.},
  author       = {Singh, Sidak Pal and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {18098--18109},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{WoodFisher: Efficient second-order approximation for neural network compression}},
  volume       = {33},
  year         = {2020},
}
