@inproceedings{14458,
  abstract = {We show for the first time that large-scale generative pretrained transformer (GPT) family models can be pruned to at least 50% sparsity in one-shot, without any retraining, at minimal loss of accuracy. This is achieved via a new pruning method called SparseGPT, specifically designed to work efficiently and accurately on massive GPT-family models. We can execute SparseGPT on the largest available open-source models, OPT-175B and BLOOM-176B, in under 4.5 hours, and can reach 60% unstructured sparsity with negligible increase in perplexity: remarkably, more than 100 billion weights from these models can be ignored at inference time. SparseGPT generalizes to semi-structured (2:4 and 4:8) patterns, and is compatible with weight quantization approaches. The code is available at: https://github.com/IST-DASLab/sparsegpt.},
  author = {Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
  issn = {2640-3498},
  location = {Honolulu, HI, United States},
  pages = {10323--10337},
  publisher = {ML Research Press},
  title = {{SparseGPT: Massive language models can be accurately pruned in one-shot}},
  volume = {202},
  year = {2023},
}

@inproceedings{11463,
  abstract = {Efficiently approximating local curvature information of the loss function is a key tool for optimization and compression of deep neural networks. Yet, most existing methods to approximate second-order information have high computational or storage costs, which limits their practicality. In this work, we investigate matrix-free, linear-time approaches for estimating Inverse-Hessian Vector Products (IHVPs) for the case when the Hessian can be approximated as a sum of rank-one matrices, as in the classic approximation of the Hessian by the empirical Fisher matrix. We propose two new algorithms: the first is tailored towards network compression and can compute the IHVP for dimension d, if the Hessian is given as a sum of m rank-one matrices, using $O(dm^2)$ precomputation, $O(dm)$ cost for computing the IHVP, and query cost $O(m)$ for any single element of the inverse Hessian. The second algorithm targets an optimization setting, where we wish to compute the product between the inverse Hessian, estimated over a sliding window of optimization steps, and a given gradient direction, as required for preconditioned SGD. We give an algorithm with cost $O(dm + m^2)$ for computing the IHVP and $O(dm + m^3)$ for adding or removing any gradient from the sliding window. These two algorithms yield state-of-the-art results for network pruning and optimization with lower computational overhead relative to existing second-order methods. Implementations are available at [9] and [17].},
  author = {Frantar, Elias and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle = {35th Conference on Neural Information Processing Systems},
  isbn = {9781713845393},
  issn = {1049-5258},
  location = {Virtual, Online},
  pages = {14873--14886},
  publisher = {Curran Associates},
  title = {{M-FAC: Efficient matrix-free approximations of second-order information}},
  volume = {34},
  year = {2021},
}

@inproceedings{8724,
  abstract = {We study the problem of learning from multiple untrusted data sources, a scenario of increasing practical relevance given the recent emergence of crowdsourcing and collaborative learning paradigms. Specifically, we analyze the situation in which a learning system obtains datasets from multiple sources, some of which might be biased or even adversarially perturbed.
    It is known that in the single-source case, an adversary with the power to corrupt a fixed fraction of the training data can prevent PAC-learnability, that is, even in the limit of infinitely much training data, no learning system can approach the optimal test error. In this work we show that, surprisingly, the same is not true in the multi-source setting, where the adversary can arbitrarily corrupt a fixed fraction of the data sources. Our main results are a generalization bound that provides finite-sample guarantees for this learning setting, as well as corresponding lower bounds. Besides establishing PAC-learnability, our results also show that in a cooperative learning setting, sharing data with other parties has provable benefits, even if some participants are malicious.},
  author = {Konstantinov, Nikola H and Frantar, Elias and Alistarh, Dan-Adrian and Lampert, Christoph},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning},
  issn = {2640-3498},
  location = {Online},
  pages = {5416--5425},
  publisher = {ML Research Press},
  title = {{On the sample complexity of adversarial multi-source PAC learning}},
  volume = {119},
  year = {2020},
}