@inproceedings{21068,
  abstract     = {Causal reasoning and discovery, two fundamental tasks of causal analysis,
often face challenges in applications due to the complexity, noisiness, and highdimensionality of real-world data. Despite recent progress in identifying latent
causal structures using causal representation learning (CRL), what makes learned
representations useful for causal downstream tasks and how to evaluate them are
still not well understood. In this paper, we reinterpret CRL using a measurement
model framework, where the learned representations are viewed as proxy measurements of the latent causal variables. Our approach clarifies the conditions under
which learned representations support downstream causal reasoning and provides
a principled basis for quantitatively assessing the quality of representations using
a new Test-based Measurement EXclusivity (T-MEX) score. We validate T-MEX
across diverse causal inference scenarios, including numerical simulations and
real-world ecological video analysis, demonstrating that the proposed framework
and corresponding score effectively assess the identification of learned representations and their usefulness for causal downstream tasks. Reproducible code can
be found at https://github.com/shimenghuang/a-measurement-perspective-of-crl.},
  author       = {Yao, Dingling and Huang, Shimeng and Cadei, Riccardo and Zhang, Kun and Locatello, Francesco},
  booktitle    = {39th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {San Diego, CA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The third pillar of causal analysis? A measurement perspective on causal representations}},
  volume       = {38},
  year         = {2025},
}

@inproceedings{21070,
  abstract     = {Deep learning systems deployed in real-world applications often encounter data that is different from their in-distribution (ID). A reliable model should ideally abstain from making decisions in this out-of-distribution (OOD) setting. Existing state-of-the-art methods primarily focus on feature distances, such as k-th nearest neighbors and distances to decision boundaries, either overlooking or ineffectively using in-distribution statistics. In this work, we propose a novel angle-based metric for OOD detection that is computed relative to the in-distribution structure. We demonstrate that the angles between feature representations and decision boundaries, viewed from the mean of in-distribution features, serve as an effective discriminative factor between ID and OOD data. We evaluate our method on nine ImageNet-pretrained models. Our approach achieves the lowest FPR in 5 out of 9 ImageNet models, obtains the best average FPR overall, and consistently ranking among the top 3 across all evaluated models. Furthermore, we highlight the benefits of contrastive representations by showing strong performance with ResNet SCL and CLIP architectures. Finally, we demonstrate that the scale-invariant nature of our score enables an ensemble strategy via simple score summation. },
  author       = {Demirel, Berker and Fumero, Marco  and Locatello, Francesco},
  booktitle    = {39th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {San Diego, CA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Out-of-Distribution detection with relative angles}},
  volume       = {38},
  year         = {2025},
}

@inproceedings{21072,
  abstract     = {Language and vision-language models have shown impressive performance across a wide range of tasks, but their internal mechanisms remain only partly understood. In this work, we study how individual attention heads in text-generative models specialize in specific semantic or visual attributes. Building on an established interpretability method, we reinterpret the practice of probing intermediate activations with the final decoding layer through the lens of signal processing. This lets us analyze multiple samples in a principled way and rank attention heads based on their relevance to target concepts. Our results show consistent patterns of specialization at the head level across both unimodal and multimodal transformers. Remarkably, we find that editing as few as 1% of the heads, selected using our method, can reliably suppress or enhance targeted concepts in the model output. We validate our approach on language tasks such as question answering and toxicity mitigation, as well as vision-language tasks including image classification and captioning. Our findings highlight an interpretable and controllable structure within attention layers, offering simple tools for understanding and editing large-scale generative models.},
  author       = {Basile, Lorenzo and Maiorca, Valentino and Doimo, Diego and Locatello, Francesco and Cazzaniga, Alberto},
  booktitle    = {39th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {San Diego, CA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Head pursuit: Probing attention specialization in multimodal transformers}},
  volume       = {38},
  year         = {2025},
}

@inproceedings{21074,
  abstract     = {Neural models learn representations of high-dimensional data on low-dimensional manifolds. Multiple factors, including stochasticities in the training process, model architectures, and additional inductive biases, may induce different representations, even when learning the same task on the same data. However, it has recently been shown that when a latent structure is shared between distinct latent spaces, relative distances between representations can be preserved, up to distortions. Building on this idea, we demonstrate that exploiting the differential-geometric structure of latent spaces of neural models, it is possible to capture precisely the transformations between representational spaces trained on similar data distributions. Specifically, we assume that distinct neural models parametrize approximately the same underlying manifold, and introduce a representation based on the pullback metric that captures the intrinsic structure of the latent space, while scaling efficiently to large models. We validate experimentally our method on model stitching and retrieval tasks, covering autoencoders and vision foundation discriminative models, across diverse architectures, datasets, pretraining schemes and modalities. Code is available at the following link.},
  author       = {Yu, Hanlin and Inal, Befrin and Arvanitidis, Georgios and Hauberg, Soren and Locatello, Francesco and Fumero, Marco},
  booktitle    = {39th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {San Diego, CA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Connecting neural models latent geometries with relative geodesic representations}},
  volume       = {38},
  year         = {2025},
}

@inproceedings{21076,
  abstract     = {In many scientific experiments, the data annotating cost constraints the pace for testing novel hypotheses. Yet, modern machine learning pipelines offer a promising solution—provided their predictions yield correct conclusions. We focus on Prediction-Powered Causal Inferences (PPCI), i.e., estimating the treatment effect in an unlabeled target experiment, relying on training data with the same outcome annotated but potentially different treatment or effect modifiers. We first show that conditional calibration guarantees valid PPCI at population level. Then, we introduce a sufficient representation constraint transferring validity across experiments, which we propose to enforce in practice in Deconfounded Empirical Risk Minimization, our new model-agnostic training objective. We validate our method on synthetic and real-world scientific data, solving impossible problem instances for Empirical Risk Minimization even with standard invariance constraints. In particular, for the first time, we achieve valid causal inference on a scientific experiment with complex recording and no human annotations, fine-tuning a foundational model on our similar annotated experiment.},
  author       = {Cadei, Riccardo and Demirel, Ilker and De Bartolomeis, Piersilvio and Lindorfer, Lukas and Cremer, Sylvia and Schmid, Cordelia and Locatello, Francesco},
  booktitle    = {39th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {San Diego, CA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Prediction-powered causal inferences}},
  volume       = {38},
  year         = {2025},
}

@inproceedings{19510,
  abstract     = {We propose a new variant of the Adam optimizer [Kingma and Ba, 2014] called
MICROADAM that specifically minimizes memory overheads, while maintaining
theoretical convergence guarantees. We achieve this by compressing the gradient
information before it is fed into the optimizer state, thereby reducing its memory
footprint significantly. We control the resulting compression error via a novel
instance of the classical error feedback mechanism from distributed optimization [Seide et al., 2014, Alistarh et al., 2018, Karimireddy et al., 2019] in which
the error correction information is itself compressed to allow for practical memory
gains. We prove that the resulting approach maintains theoretical convergence
guarantees competitive to those of AMSGrad, while providing good practical performance. Specifically, we show that MICROADAM can be implemented efficiently
on GPUs: on both million-scale (BERT) and billion-scale (LLaMA) models, MICROADAM provides practical convergence competitive to that of the uncompressed
Adam baseline, with lower memory usage and similar running time. Our code is
available at https://github.com/IST-DASLab/MicroAdam.},
  author       = {Modoranu, Ionut-Vlad and Safaryan, Mher and Malinovsky, Grigory and Kurtic, Eldar and Robert, Thomas and Richtárik, Peter and Alistarh, Dan-Adrian},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19511,
  abstract     = {We introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end, including all weights, activations, and KV cache in 4 bits. QuaRot rotates LLMs in a way that removes outliers from the hidden state without changing the output, making quantization easier. This computational invariance is applied to the hidden state (residual) of the LLM, as well as to the activations of the feed-forward components, aspects of the attention mechanism, and to the KV cache. The result is a quantized model where all matrix multiplications are performed in 4 bits, without any channels identified for retention in higher precision. Our 4-bit quantized LLAMA2-70B model has losses of at most 0.47 WikiText-2 perplexity and retains 99% of the zero-shot performance. We also show that QuaRot can provide lossless 6 and 8 bit LLAMA-2 models without any calibration data using round-to-nearest quantization. Code is available at github.com/spcl/QuaRot.},
  author       = {Ashkboos, Saleh and Mohtashami, Amirkeivan and Croci, Maximilian L. and Li, Bo and Cameron, Pashmina and Jaggi, Martin and Alistarh, Dan-Adrian and Hoefler, Torsten and Hensman, James},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{QuaRot: Outlier-free 4-bit inference in rotated LLMs}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19512,
  abstract     = {Differential privacy with gradual expiration models the setting where data items
arrive in a stream and at a given time t the privacy loss guaranteed for a data item
seen at time (t − d) is εg(d), where g is a monotonically non-decreasing function.
We study the fundamental continual (binary) counting problem where each data
item consists of a bit, and the algorithm needs to output at each time step the sum of
all the bits streamed so far. For a stream of length T and privacy without expiration
continual counting is possible with maximum (over all time steps) additive error
O(log2
(T)/ε) and the best known lower bound is Ω(log(T)/ε); closing this gap
is a challenging open problem.
We show that the situation is very different for privacy with gradual expiration by
giving upper and lower bounds for a large set of expiration functions g. Specifically,
our algorithm achieves an additive error of O(log(T)/ε) for a large set of privacy
expiration functions. We also give a lower bound that shows that if C is the additive
error of any ε-DP algorithm for this problem, then the product of C and the privacy
expiration function after 2C steps must be Ω(log(T)/ε). Our algorithm matches
this lower bound as its additive error is O(log(T)/ε), even when g(2C) = O(1).
Our empirical evaluation shows that we achieve a slowly growing privacy loss
with significantly smaller empirical privacy loss for large values of d than a natural
baseline algorithm.},
  author       = {Andersson, Joel Daniel and Henzinger, Monika H and Pagh, Rasmus and Steiner, Teresa Anna and Upadhyay, Jalaj},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Continual counting with gradual privacy expiration}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19515,
  abstract     = {Neural models learn data representations that lie on low-dimensional manifolds,
yet modeling the relation between these representational spaces is an ongoing challenge. By integrating spectral geometry principles into neural modeling, we show
that this problem can be better addressed in the functional domain, mitigating complexity, while enhancing interpretability and performances on downstream tasks.
To this end, we introduce a multi-purpose framework to the representation learning
community, which allows to: (i) compare different spaces in an interpretable way
and measure their intrinsic similarity; (ii) find correspondences between them, both
in unsupervised and weakly supervised settings, and (iii) to effectively transfer
representations between distinct spaces. We validate our framework on various
applications, ranging from stitching to retrieval tasks, and on multiple modalities,
demonstrating that Latent Functional Maps can serve as a swiss-army knife for
representation alignment},
  author       = {Fumero, Marco and Pegoraro, Marco and Maiorca, Valentino and Locatello, Francesco and Rodolà, Emanuele},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Latent functional maps: A spectral framework for representation alignment}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19517,
  abstract     = {In this paper, we present a novel data-free method for merging neural networks in weight space. Differently from most existing works, our method optimizes for the permutations of network neurons globally across all layers. This allows us to enforce cycle consistency of the permutations when merging n ≥ 3 models, allowing circular compositions of permutations to be computed without accumulating error along the path. We qualitatively and quantitatively motivate the need for such a constraint, showing its benefits when merging sets of models in scenarios spanning varying architectures and datasets. We finally show that, when coupled
with activation renormalization, our approach yields the best results in the task.},
  author       = {Crisostomi, Donato and Fumero, Marco and Baieri, Daniele and Bernard, Florian and Rodolà, Emanuele},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{C2M3: Cycle-consistent multi-model merging}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19518,
  abstract     = {The rising footprint of machine learning has led to a focus on imposing model
sparsity as a means of reducing computational and memory costs. For deep neural
networks (DNNs), the state-of-the-art accuracy-vs-sparsity is achieved by heuristics
inspired by the classical Optimal Brain Surgeon (OBS) framework [LeCun et al.,
1989, Hassibi and Stork, 1992, Hassibi et al., 1993], which leverages loss curvature
information to make better pruning decisions. Yet, these results still lack a solid
theoretical understanding, and it is unclear whether they can be improved by
leveraging connections to the wealth of work on sparse recovery algorithms. In this
paper, we draw new connections between these two areas and present new sparse
recovery algorithms inspired by the OBS framework that comes with theoretical
guarantees under reasonable assumptions and have strong practical performance.
Specifically, our work starts from the observation that we can leverage curvature
information in OBS-like fashion upon the projection step of classic iterative sparse
recovery algorithms such as IHT. We show for the first time that this leads both
to improved convergence bounds under standard assumptions. Furthermore, we
present extensions of this approach to the practical task of obtaining accurate sparse
DNNs, and validate it experimentally at scale for Transformer-based models on
vision and language tasks.},
  author       = {Wu, Diyuan and Modoranu, Ionut-Vlad and Safaryan, Mher and Kuznedelev, Denis and Alistarh, Dan-Adrian},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19519,
  abstract     = {There has been significant interest in "extreme" compression of large language models (LLMs), i.e. to 1-2 bits per parameter, which allows such models to be executed efficiently on resource-constrained devices. Existing work focused on improved one-shot quantization techniques and weight representations; yet, purely post-training approaches are reaching diminishing returns in terms of the accuracy-vs-bit-width trade-off. State-of-the-art quantization methods such as QuIP# and AQLM include fine-tuning (part of) the compressed parameters over a limited amount of calibration data; however, such fine-tuning techniques over compressed weights often make exclusive use of straight-through estimators (STE), whose performance is not well-understood in this setting. In this work, we question the use of STE for extreme LLM compression, showing that it can be sub-optimal, and perform a systematic study of quantization-aware fine-tuning strategies for LLMs.We propose PV-Tuning - a representation-agnostic framework that generalizes and improves upon existing fine-tuning strategies, and provides convergence guarantees in restricted cases.On the practical side, when used for 1-2 bit vector quantization, PV-Tuning outperforms prior techniques for highly-performant models such as Llama and Mistral. Using PV-Tuning, we achieve the first Pareto-optimal quantization for Llama-2 family models at 2 bits per parameter.},
  author       = {Malinovskii, Vladimir and Mazur, Denis and Ilin, Ivan and Kuznedelev, Denis and Burlachenko, Konstantin and Yi, Kai and Alistarh, Dan-Adrian and Richtarik, Peter},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  isbn         = {9798331314385},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{PV-tuning: Beyond straight-through estimation for extreme LLM compression}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{15363,
  abstract     = {Knowledge distillation is a popular approach for enhancing the performance of "student" models, with lower representational capacity, by taking advantage of more powerful "teacher" models. Despite its apparent simplicity, the underlying mechanics behind knowledge distillation (KD) are not yet fully understood. In this work, we shed new light on the inner workings of this method, by examining it from an optimization perspective. Specifically, we show that, in the context of linear and deep linear models, KD can be interpreted as a novel type of stochastic variance reduction mechanism. We provide a detailed convergence analysis of the resulting dynamics, which hold under standard assumptions for both strongly-convex and non-convex losses, showing that KD acts as a form of \emph{partial variance reduction}, which can reduce the stochastic gradient noise, but may not eliminate it completely, depending on the properties of the teacher'' model. Our analysis puts further emphasis on the need for careful parametrization of KD, in particular w.r.t. the weighting of the distillation loss, and is validated empirically on both linear models and deep neural networks.},
  author       = {Safaryan, Mher and Peste, Elena-Alexandra and Alistarh, Dan-Adrian},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {New Orleans, LA, United States},
  title        = {{Knowledge distillation performs partial variance reduction}},
  volume       = {36},
  year         = {2023},
}

@inproceedings{15364,
  abstract     = {Clustering is a fundamental problem in unsupervised machine learning with many applications in data analysis. Popular clustering algorithms such as Lloyd's algorithm and k-means++ can make Ω(ndk) time when clustering n points in a d-dimensional space (represented by an n×d matrix X) into k clusters. On massive datasets with moderate to large k, the multiplicative 
k factor can become very expensive. We introduce a simple randomized clustering algorithm that provably runs in expected time O(nnz(X)+nlogn) for arbitrary k. Here nnz(X) is the total number of non-zero entries in the input dataset X, which is upper bounded by nd and can be significantly smaller for sparse datasets. We prove that our algorithm achieves approximation ratio ˜O(k4) on any input dataset for the k-means objective, and our experiments show that the quality of the clusters found by our algorithm is usually much better than this worst-case bound. We use our algorithm for k-means clustering and for coreset construction; our experiments show that it gives a new tradeoff between running time and cluster quality compared to previous state-of-the-art methods for these tasks. Our theoretical analysis is based on novel results of independent interest. We show that the approximation ratio achieved after a random one-dimensional projection can be lifted to the original points and that k-means++ seeding can be implemented in expected time O(nlogn) in one dimension.},
  author       = {Charikar, Moses and Hu, Lunjia and Henzinger, Monika H and Vötsch, Maximilian and Waingarten, Erik},
  booktitle    = {37th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {New Orleans, LA, United States},
  title        = {{Simple, scalable and effective clustering via one-dimensional projections}},
  volume       = {36},
  year         = {2023},
}

@inproceedings{18876,
  abstract     = {Convolutional neural networks were the standard for solving many computer vision tasks until recently, when Transformers of MLP-based architectures have started to show competitive performance. These architectures typically have a vast number of weights and need to be trained on massive datasets; hence, they are not suitable for their use in low-data regimes. In this work, we propose a simple yet effective framework to improve generalization from small amounts of data. We augment modern CNNs with fully-connected (FC) layers and show the massive impact this architectural change has in low-data regimes. We further present an online joint knowledge-distillation method to utilize the extra FC layers at train time but avoid them during test time. This allows us to improve the generalization of a CNN-based model without any increase in the number of weights at test time. We perform classification experiments for a large range of network backbones and several standard datasets on supervised learning and active learning. Our experiments significantly outperform the networks without fully-connected layers, reaching a relative improvement of up to 16% validation accuracy in the supervised setting without adding any extra parameters during inference.},
  author       = {Kocsis, Peter and Súkeník, Peter and Brasó, Guillem and Niessner, Matthias and Leal-Taixé, Laura and Elezi, Ismail},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {New Orleans, LA, United States},
  pages        = {1896--1908},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The unreasonable effectiveness of fully-connected layers for low-data regimes}},
  volume       = {35},
  year         = {2022},
}

@inproceedings{11452,
  abstract     = {We study efficient distributed algorithms for the fundamental problem of principal component analysis and leading eigenvector computation on the sphere, when the data are randomly distributed among a set of computational nodes. We propose a new quantized variant of Riemannian gradient descent to solve this problem, and prove that the algorithm converges with high probability under a set of necessary spherical-convexity properties. We give bounds on the number of bits transmitted by the algorithm under common initialization schemes, and investigate the dependency on the problem dimension in each case.},
  author       = {Alimisis, Foivos and Davies, Peter and Vandereycken, Bart and Alistarh, Dan-Adrian},
  booktitle    = {Advances in Neural Information Processing Systems - 35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {2823--2834},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Distributed principal component analysis with limited communication}},
  volume       = {4},
  year         = {2021},
}

@inproceedings{11453,
  abstract     = {Neuronal computations depend on synaptic connectivity and intrinsic electrophysiological properties. Synaptic connectivity determines which inputs from presynaptic neurons are integrated, while cellular properties determine how inputs are filtered over time. Unlike their biological counterparts, most computational approaches to learning in simulated neural networks are limited to changes in synaptic connectivity. However, if intrinsic parameters change, neural computations are altered drastically. Here, we include the parameters that determine the intrinsic properties,
e.g., time constants and reset potential, into the learning paradigm. Using sparse feedback signals that indicate target spike times, and gradient-based parameter updates, we show that the intrinsic parameters can be learned along with the synaptic weights to produce specific input-output functions. Specifically, we use a teacher-student paradigm in which a randomly initialised leaky integrate-and-fire or resonate-and-fire neuron must recover the parameters of a teacher neuron. We show that complex temporal functions can be learned online and without backpropagation through time, relying on event-based updates only. Our results are a step towards online learning of neural computations from ungraded and unsigned sparse feedback signals with a biologically inspired learning mechanism.},
  author       = {Braun, Lukas and Vogels, Tim P},
  booktitle    = {Advances in Neural Information Processing Systems - 35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {16437--16450},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Online learning of neural computations from sparse temporal feedback}},
  volume       = {20},
  year         = {2021},
}

@inproceedings{11463,
  abstract     = {Efficiently approximating local curvature information of the loss function is a key tool for optimization and compression of deep neural networks. Yet, most existing methods to approximate second-order information have high computational
or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating Inverse-Hessian Vector Products (IHVPs) for the case when the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix. We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension d, if the Hessian is given as a sum of m rank-one matrices, using O(dm2) precomputation, O(dm) cost for computing the IHVP, and query cost O(m) for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost O(dm + m2) for computing the IHVP and O(dm + m3) for adding or removing any gradient from the sliding window. These
two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].},
  author       = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {14873--14886},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{M-FAC: Efficient matrix-free approximations of second-order information}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{11464,
  abstract     = {We consider a standard distributed optimisation setting where N machines, each holding a d-dimensional function
fi, aim to jointly minimise the sum of the functions ∑Ni=1fi(x). This problem arises naturally in large-scale distributed optimisation, where a standard solution is to apply variants of (stochastic) gradient descent. We focus on the communication complexity of this problem: our main result provides the first fully unconditional bounds on total number of bits which need to be sent and received by the N machines to solve this problem under point-to-point communication, within a given error-tolerance. Specifically, we show that Ω(Ndlogd/Nε) total bits need to be communicated between the machines to find an additive ϵ-approximation to the minimum of ∑Ni=1fi(x). The result holds for both deterministic and randomised algorithms, and, importantly, requires no assumptions on the algorithm structure. The lower bound is tight under certain restrictions on parameter values, and is matched within constant factors for quadratic objectives by a new variant of quantised gradient descent, which we describe and analyse. Our results bring over tools from communication complexity to distributed optimisation, which has potential for further applications.},
  author       = {Alistarh, Dan-Adrian and Korhonen, Janne},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual, Online},
  pages        = {7254--7266},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Towards tight communication lower bounds for distributed optimisation}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{10593,
  abstract     = {We study the problem of estimating a rank-$1$ signal in the presence of rotationally invariant noise-a class of perturbations more general than Gaussian noise. Principal Component Analysis (PCA) provides a natural estimator, and sharp results on its performance have been obtained in the high-dimensional regime. Recently, an Approximate Message Passing (AMP) algorithm has been proposed as an alternative estimator with the potential to improve the accuracy of PCA. However, the existing analysis of AMP requires an initialization that is both correlated with the signal and independent of the noise, which is often unrealistic in practice. In this work, we combine the two methods, and propose to initialize AMP with PCA. Our main result is a rigorous asymptotic characterization of the performance of this estimator. Both the AMP algorithm and its analysis differ from those previously derived in the Gaussian setting: at every iteration, our AMP algorithm requires a specific term to account for PCA initialization, while in the Gaussian case, PCA initialization affects only the first iteration of AMP. The proof is based on a two-phase artificial AMP that first approximates the PCA estimator and then mimics the true AMP. Our numerical simulations show an excellent agreement between AMP results and theoretical predictions, and suggest an interesting open direction on achieving Bayes-optimal performance.},
  author       = {Mondelli, Marco and Venkataramanan, Ramji},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {29616--29629},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{PCA initialization for approximate message passing in rotationally invariant models}},
  volume       = {35},
  year         = {2021},
}