[{"citation":{"ieee":"E. Kurtic, T. Hoefler, and D.-A. Alistarh, “How to prune your language model: Recovering accuracy on the ‘Sparsity May Cry’ benchmark,” in <i>Proceedings of Machine Learning Research</i>, Hongkong, China, 2024, vol. 234, pp. 542–553.","chicago":"Kurtic, Eldar, Torsten Hoefler, and Dan-Adrian Alistarh. “How to Prune Your Language Model: Recovering Accuracy on the ‘Sparsity May Cry’ Benchmark.” In <i>Proceedings of Machine Learning Research</i>, 234:542–53. ML Research Press, 2024.","ista":"Kurtic E, Hoefler T, Alistarh D-A. 2024. How to prune your language model: Recovering accuracy on the ‘Sparsity May Cry’ benchmark. Proceedings of Machine Learning Research. CPAL: Conference on Parsimony and Learning, PMLR, vol. 234, 542–553.","mla":"Kurtic, Eldar, et al. “How to Prune Your Language Model: Recovering Accuracy on the ‘Sparsity May Cry’ Benchmark.” <i>Proceedings of Machine Learning Research</i>, vol. 234, ML Research Press, 2024, pp. 542–53.","short":"E. Kurtic, T. Hoefler, D.-A. Alistarh, in:, Proceedings of Machine Learning Research, ML Research Press, 2024, pp. 542–553.","ama":"Kurtic E, Hoefler T, Alistarh D-A. How to prune your language model: Recovering accuracy on the “Sparsity May Cry” benchmark. In: <i>Proceedings of Machine Learning Research</i>. Vol 234. ML Research Press; 2024:542-553.","apa":"Kurtic, E., Hoefler, T., &#38; Alistarh, D.-A. (2024). How to prune your language model: Recovering accuracy on the “Sparsity May Cry” benchmark. In <i>Proceedings of Machine Learning Research</i> (Vol. 234, pp. 542–553). 
Hongkong, China: ML Research Press."},"publisher":"ML Research Press","volume":234,"oa":1,"publication_identifier":{"eissn":["2640-3498"]},"intvolume":"       234","conference":{"name":"CPAL: Conference on Parsimony and Learning","end_date":"2024-01-06","start_date":"2024-01-03","location":"Hongkong, China"},"quality_controlled":"1","page":"542-553","month":"01","type":"conference","external_id":{"arxiv":["2312.13547"]},"publication_status":"published","publication":"Proceedings of Machine Learning Research","status":"public","oa_version":"Preprint","date_updated":"2024-10-09T21:08:16Z","day":"08","alternative_title":["PMLR"],"article_processing_charge":"No","date_published":"2024-01-08T00:00:00Z","title":"How to prune your language model: Recovering accuracy on the \"Sparsity May Cry\" benchmark","abstract":[{"text":"Pruning large language models (LLMs) from the BERT family has emerged as a standard compression benchmark, and several pruning methods have been proposed for this task. The recent “Sparsity May Cry” (SMC) benchmark put into question the validity of all existing methods, exhibiting a more complex setup where many known pruning methods appear to fail. We revisit the question of accurate BERT-pruning during fine-tuning on downstream datasets, and propose a set of general guidelines for successful pruning, even on the challenging SMC benchmark. First, we perform a cost-vs-benefits analysis of pruning model components, such as the embeddings and the classification head; second, we provide a simple-yet-general way of scaling training, sparsification and learning rate schedules relative to the desired target sparsity; finally, we investigate the importance of proper parametrization for Knowledge Distillation in the context of LLMs. 
Our simple insights lead to state-of-the-art results, both on classic BERT-pruning benchmarks, as well as on the SMC benchmark, showing that even classic gradual magnitude pruning (GMP) can yield competitive results, with the right approach.","lang":"eng"}],"arxiv":1,"scopus_import":"1","_id":"15011","author":[{"last_name":"Kurtic","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","full_name":"Kurtic, Eldar","first_name":"Eldar"},{"full_name":"Hoefler, Torsten","last_name":"Hoefler","first_name":"Torsten"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian"}],"date_created":"2024-02-18T23:01:03Z","department":[{"_id":"DaAl"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","corr_author":"1","language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://proceedings.mlr.press/v234/kurtic24a"}]},{"arxiv":1,"abstract":[{"text":"Federated Learning (FL) enables large-scale distributed training of machine learning models, while still allowing individual nodes to maintain data locally. However, executing FL at scale comes with inherent practical challenges: 1) heterogeneity of the local node data distributions, 2) heterogeneity of node computational speeds (asynchrony), but also 3) constraints in the amount of communication between the clients and the server. In this work, we present the first variant of the classic federated averaging (FedAvg) algorithm which, at the same time, supports data heterogeneity, partial client asynchrony, and communication compression. Our algorithm comes with a novel, rigorous analysis showing that, in spite of these system relaxations, it can provide similar convergence to FedAvg in interesting parameter regimes. 
Experimental results in the rigorous LEAF benchmark on setups of up to 300 nodes show that our algorithm ensures fast convergence for standard federated tasks, improving upon prior quantized and asynchronous approaches.","lang":"eng"}],"title":"Communication-efficient federated learning with data and client heterogeneity","article_processing_charge":"No","date_published":"2024-05-01T00:00:00Z","day":"01","date_updated":"2024-10-09T21:08:57Z","alternative_title":["PMLR"],"oa_version":"Preprint","publication_status":"published","status":"public","publication":"Proceedings of the 27th International Conference on Artificial Intelligence and Statistics","external_id":{"arxiv":["2206.10032"]},"month":"05","type":"conference","page":"3448-3456","quality_controlled":"1","intvolume":"       238","conference":{"location":"Valencia, Spain","end_date":"2024-05-04","name":"AISTATS: Conference on Artificial Intelligence and Statistics","start_date":"2024-05-02"},"publication_identifier":{"eissn":["2640-3498"]},"oa":1,"publisher":"ML Research Press","volume":238,"citation":{"apa":"Zakerinia, H., Talaei, S., Nadiradze, G., &#38; Alistarh, D.-A. (2024). Communication-efficient federated learning with data and client heterogeneity. In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i> (Vol. 238, pp. 3448–3456). Valencia, Spain: ML Research Press.","ama":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. Communication-efficient federated learning with data and client heterogeneity. In: <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>. Vol 238. ML Research Press; 2024:3448-3456.","short":"H. Zakerinia, S. Talaei, G. Nadiradze, D.-A. Alistarh, in:, Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, ML Research Press, 2024, pp. 3448–3456.","mla":"Zakerinia, Hossein, et al. 
“Communication-Efficient Federated Learning with Data and Client Heterogeneity.” <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, vol. 238, ML Research Press, 2024, pp. 3448–56.","ista":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. 2024. Communication-efficient federated learning with data and client heterogeneity. Proceedings of the 27th International Conference on Artificial Intelligence and Statistics. AISTATS: Conference on Artificial Intelligence and Statistics, PMLR, vol. 238, 3448–3456.","chicago":"Zakerinia, Hossein, Shayan Talaei, Giorgi Nadiradze, and Dan-Adrian Alistarh. “Communication-Efficient Federated Learning with Data and Client Heterogeneity.” In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, 238:3448–56. ML Research Press, 2024.","ieee":"H. Zakerinia, S. Talaei, G. Nadiradze, and D.-A. Alistarh, “Communication-efficient federated learning with data and client heterogeneity,” in <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, Valencia, Spain, 2024, vol. 238, pp. 
3448–3456."},"language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2206.10032"}],"corr_author":"1","year":"2024","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2024-06-02T22:00:57Z","department":[{"_id":"DaAl"},{"_id":"ChLa"}],"author":[{"first_name":"Hossein","last_name":"Zakerinia","id":"653bd8b6-f394-11eb-9cf6-c0bbf6cd78d4","full_name":"Zakerinia, Hossein"},{"first_name":"Shayan","full_name":"Talaei, Shayan","last_name":"Talaei"},{"full_name":"Nadiradze, Giorgi","id":"3279A00C-F248-11E8-B48F-1D18A9856A87","last_name":"Nadiradze","orcid":"0000-0001-5634-0731","first_name":"Giorgi"},{"last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"}],"_id":"17093","scopus_import":"1"},{"abstract":[{"text":"Leveraging second-order information about the loss at the scale of deep networks is one of the main lines of approach for improving the performance of current optimizers for deep learning. Yet, existing approaches for accurate full-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free Approximate Curvature (M-FAC) suffer from massive storage costs when applied even to small-scale models, as they must store a sliding window of gradients, whose memory requirements are multiplicative in the model dimension. In this paper, we address this issue via a novel and efficient error-feedback technique that can be applied to compress preconditioners by up to two orders of magnitude in practice, without loss of convergence. Specifically, our approach compresses the gradient information via sparsification or low-rank compression before it is fed into the preconditioner, feeding the compression error back into future iterations. 
Extensive experiments on deep neural networks show that this approach can compress full-matrix preconditioners to up to 99% sparsity without accuracy loss, effectively removing the memory overhead of full-matrix preconditioners such as GGT and M-FAC.","lang":"eng"}],"arxiv":1,"title":"Error feedback can accurately compress preconditioners","article_processing_charge":"No","date_published":"2024-07-30T00:00:00Z","acknowledged_ssus":[{"_id":"CampIT"}],"oa_version":"Preprint","alternative_title":["PMLR"],"day":"30","date_updated":"2025-01-30T07:54:16Z","status":"public","publication":"41st International Conference on Machine Learning","publication_status":"published","page":"35910-35933","external_id":{"arxiv":["2306.06098"]},"month":"07","type":"conference","quality_controlled":"1","publication_identifier":{"eissn":["2640-3498"]},"intvolume":"       235","OA_place":"repository","conference":{"location":"Vienna, Austria","end_date":"2024-07-27","name":"ICML: International Conference on Machine Learning","start_date":"2024-07-21"},"publisher":"ML Research Press","volume":235,"oa":1,"citation":{"short":"I.-V. Modoranu, A. Kalinov, E. Kurtic, E. Frantar, D.-A. Alistarh, in:, 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 35910–35933.","mla":"Modoranu, Ionut-Vlad, et al. “Error Feedback Can Accurately Compress Preconditioners.” <i>41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 35910–33.","chicago":"Modoranu, Ionut-Vlad, Aleksei Kalinov, Eldar Kurtic, Elias Frantar, and Dan-Adrian Alistarh. “Error Feedback Can Accurately Compress Preconditioners.” In <i>41st International Conference on Machine Learning</i>, 235:35910–33. ML Research Press, 2024.","ista":"Modoranu I-V, Kalinov A, Kurtic E, Frantar E, Alistarh D-A. 2024. Error feedback can accurately compress preconditioners. 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 
235, 35910–35933.","ieee":"I.-V. Modoranu, A. Kalinov, E. Kurtic, E. Frantar, and D.-A. Alistarh, “Error feedback can accurately compress preconditioners,” in <i>41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 35910–35933.","apa":"Modoranu, I.-V., Kalinov, A., Kurtic, E., Frantar, E., &#38; Alistarh, D.-A. (2024). Error feedback can accurately compress preconditioners. In <i>41st International Conference on Machine Learning</i> (Vol. 235, pp. 35910–35933). Vienna, Austria: ML Research Press.","ama":"Modoranu I-V, Kalinov A, Kurtic E, Frantar E, Alistarh D-A. Error feedback can accurately compress preconditioners. In: <i>41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:35910-35933."},"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2306.06098"}],"language":[{"iso":"eng"}],"OA_type":"green","corr_author":"1","year":"2024","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","department":[{"_id":"DaAl"}],"date_created":"2025-01-30T07:53:22Z","author":[{"id":"449f7a18-f128-11eb-9611-9b430c0c6333","last_name":"Modoranu","full_name":"Modoranu, Ionut-Vlad","first_name":"Ionut-Vlad"},{"first_name":"Aleksei","orcid":"0000-0003-2189-3904","id":"44b7120e-eb97-11eb-a6c2-e1557aa81d02","last_name":"Kalinov","full_name":"Kalinov, Aleksei"},{"first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","last_name":"Kurtic","full_name":"Kurtic, Eldar"},{"full_name":"Frantar, Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","last_name":"Frantar","first_name":"Elias"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"_id":"18975","scopus_import":"1","acknowledgement":"The authors thank Adrian Vladu, Razvan Pascanu, Alexandra Peste, Mher Safaryan for their valuable feedback, the IT department from Institute of Science and Technology Austria for the hardware support 
and Weights and Biases for the infrastructure to track all our experiments."},{"ec_funded":1,"arxiv":1,"abstract":[{"lang":"eng","text":"We analyze asynchronous-type algorithms for distributed SGD in the heterogeneous setting, where each worker has its own computation and communication speeds, as well as data distribution. In these algorithms, workers compute possibly stale and stochastic gradients associated with their local data at some iteration back in history and then return those gradients to the server without synchronizing with other workers. We present a unified convergence theory for non-convex smooth functions in the heterogeneous regime. The proposed analysis provides convergence for pure asynchronous SGD and its various modifications. Moreover, our theory explains what affects the convergence rate and what can be done to improve the performance of asynchronous algorithms. In particular, we introduce a novel asynchronous method based on worker shuffling. As a by-product of our analysis, we also demonstrate convergence guarantees for gradient-type algorithms such as SGD with random reshuffling and shuffle-once mini-batch SGD. The derived rates match the best-known results for those algorithms, highlighting the tightness of our approach. 
Finally, our numerical evaluations support theoretical findings and show the good practical performance of our method."}],"title":"AsGrad: A sharp unified analysis of asynchronous-SGD algorithms","alternative_title":["PMLR"],"day":"15","date_updated":"2025-04-14T07:54:52Z","oa_version":"Preprint","article_processing_charge":"No","date_published":"2024-05-15T00:00:00Z","publication":"Proceedings of The 27th International Conference on Artificial Intelligence and Statistics","publication_status":"published","status":"public","quality_controlled":"1","type":"conference","external_id":{"arxiv":["2310.20452"]},"month":"05","page":"649-657","volume":238,"oa":1,"publisher":"ML Research Press","conference":{"end_date":"2024-05-04","name":"AISTATS: Conference on Artificial Intelligence and Statistics","start_date":"2024-05-02","location":"Valencia, Spain"},"OA_place":"repository","intvolume":"       238","publication_identifier":{"eissn":["2640-3498"]},"project":[{"name":"IST-BRIDGE: International postdoctoral program","call_identifier":"H2020","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","grant_number":"101034413"}],"citation":{"ieee":"R. Islamov, M. Safaryan, and D.-A. Alistarh, “AsGrad: A sharp unified analysis of asynchronous-SGD algorithms,” in <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, Valencia, Spain, 2024, vol. 238, pp. 649–657.","ista":"Islamov R, Safaryan M, Alistarh D-A. 2024. AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. Proceedings of The 27th International Conference on Artificial Intelligence and Statistics. AISTATS: Conference on Artificial Intelligence and Statistics, PMLR, vol. 238, 649–657.","chicago":"Islamov, Rustem, Mher Safaryan, and Dan-Adrian Alistarh. “AsGrad: A Sharp Unified Analysis of Asynchronous-SGD Algorithms.” In <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, 238:649–57. 
ML Research Press, 2024.","mla":"Islamov, Rustem, et al. “AsGrad: A Sharp Unified Analysis of Asynchronous-SGD Algorithms.” <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, vol. 238, ML Research Press, 2024, pp. 649–57.","short":"R. Islamov, M. Safaryan, D.-A. Alistarh, in:, Proceedings of The 27th International Conference on Artificial Intelligence and Statistics, ML Research Press, 2024, pp. 649–657.","ama":"Islamov R, Safaryan M, Alistarh D-A. AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. In: <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>. Vol 238. ML Research Press; 2024:649-657.","apa":"Islamov, R., Safaryan, M., &#38; Alistarh, D.-A. (2024). AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. In <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i> (Vol. 238, pp. 649–657). Valencia, Spain: ML Research Press."},"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2310.20452"}],"language":[{"iso":"eng"}],"corr_author":"1","OA_type":"green","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","author":[{"first_name":"Rustem","full_name":"Islamov, Rustem","last_name":"Islamov"},{"first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","last_name":"Safaryan","full_name":"Safaryan, Mher"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian"}],"date_created":"2025-01-30T08:15:49Z","department":[{"_id":"DaAl"}],"_id":"18976","acknowledgement":"The authors thank all anonymous reviewers for their valuable comments and suggestions on how to improve the manuscript. This work was done when Rustem Islamov was a Master’s student at Institut Polytechnique de Paris (IP Paris) and an intern at Institute of Science and Technology Austria (ISTA). 
The research of Rustem Islamov was supported by ISTA internship\r\nprogram. Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No 101034413.","scopus_import":"1"},{"abstract":[{"lang":"eng","text":"Recent advances in large language model (LLM) pretraining have led to high-quality LLMs with impressive abilities. By compressing such LLMs via quantization to 3-4 bits per parameter, they can fit into memory-limited devices such as laptops and mobile phones, enabling personalized use. Quantizing models to 3-4 bits per parameter can lead to moderate to high accuracy losses, especially for smaller models (1-10B parameters), which are suitable for edge deployment. To address this accuracy issue, we introduce the Sparse-Quantized Representation (SpQR), a new compressed format and quantization technique that enables for the first time \\emph{near-lossless} compression of LLMs across model scales while reaching similar compression levels to previous methods. SpQR works by identifying and isolating \\emph{outlier weights}, which cause particularly large quantization errors, and storing them in higher precision while compressing all other weights to 3-4 bits, and achieves relative accuracy losses of less than 1% in perplexity for highly-accurate LLaMA and Falcon LLMs. This makes it possible to run a 33B parameter LLM on a single 24 GB consumer GPU without performance degradation at 15% speedup, thus making powerful LLMs available to consumers without any downsides. SpQR comes with efficient algorithms for both encoding weights into its format, as well as decoding them efficiently at runtime. 
Specifically, we provide an efficient GPU inference algorithm for SpQR, which yields faster inference than 16-bit baselines at similar accuracy while enabling memory compression gains of more than 4x."}],"arxiv":1,"title":"SpQR: A sparse-quantized representation for near-lossless LLM weight compression","oa_version":"Preprint","day":"15","date_updated":"2025-01-30T08:27:47Z","article_processing_charge":"No","date_published":"2024-05-15T00:00:00Z","publication_status":"published","status":"public","publication":"12th International Conference on Learning Representations","quality_controlled":"1","external_id":{"arxiv":["2306.03078"]},"month":"05","type":"conference","oa":1,"publisher":"OpenReview","OA_place":"repository","conference":{"location":"Vienna, Austria","start_date":"2024-05-07","end_date":"2024-05-11","name":"ICLR: International Conference on Learning Representations"},"citation":{"mla":"Dettmers, Tim, et al. “SpQR: A Sparse-Quantized Representation for near-Lossless LLM Weight Compression.” <i>12th International Conference on Learning Representations</i>, OpenReview, 2024.","short":"T. Dettmers, R.A. Svirschevski, V. Egiazarian, D. Kuznedelev, E. Frantar, S. Ashkboos, A. Borzunov, T. Hoefler, D.-A. Alistarh, in:, 12th International Conference on Learning Representations, OpenReview, 2024.","ieee":"T. Dettmers <i>et al.</i>, “SpQR: A sparse-quantized representation for near-lossless LLM weight compression,” in <i>12th International Conference on Learning Representations</i>, Vienna, Austria, 2024.","chicago":"Dettmers, Tim, Ruslan A. Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, and Dan-Adrian Alistarh. “SpQR: A Sparse-Quantized Representation for near-Lossless LLM Weight Compression.” In <i>12th International Conference on Learning Representations</i>. 
OpenReview, 2024.","ista":"Dettmers T, Svirschevski RA, Egiazarian V, Kuznedelev D, Frantar E, Ashkboos S, Borzunov A, Hoefler T, Alistarh D-A. 2024. SpQR: A sparse-quantized representation for near-lossless LLM weight compression. 12th International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","apa":"Dettmers, T., Svirschevski, R. A., Egiazarian, V., Kuznedelev, D., Frantar, E., Ashkboos, S., … Alistarh, D.-A. (2024). SpQR: A sparse-quantized representation for near-lossless LLM weight compression. In <i>12th International Conference on Learning Representations</i>. Vienna, Austria: OpenReview.","ama":"Dettmers T, Svirschevski RA, Egiazarian V, et al. SpQR: A sparse-quantized representation for near-lossless LLM weight compression. In: <i>12th International Conference on Learning Representations</i>. OpenReview; 2024."},"language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2306.03078"}],"OA_type":"green","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","author":[{"full_name":"Dettmers, Tim","last_name":"Dettmers","first_name":"Tim"},{"first_name":"Ruslan A.","last_name":"Svirschevski","full_name":"Svirschevski, Ruslan A."},{"first_name":"Vage","last_name":"Egiazarian","full_name":"Egiazarian, Vage"},{"first_name":"Denis","last_name":"Kuznedelev","full_name":"Kuznedelev, Denis"},{"first_name":"Elias","last_name":"Frantar","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias"},{"first_name":"Saleh","full_name":"Ashkboos, Saleh","last_name":"Ashkboos"},{"first_name":"Alexander","full_name":"Borzunov, Alexander","last_name":"Borzunov"},{"full_name":"Hoefler, Torsten","last_name":"Hoefler","first_name":"Torsten"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, 
Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"department":[{"_id":"DaAl"}],"date_created":"2025-01-30T08:26:59Z","_id":"18977","acknowledgement":"Denis Kuznedelev acknowledges the support from the Russian Ministry of Science and Higher\r\nEducation, grant No. 075-10-2021-068. Ruslan Svirschevski and Vage Egiazarian and Denis\r\nKuznedelev were supported by the grant for research centers in the field of AI provided by the\r\nAnalytical Center for the Government of the Russian Federation (ACRF) in accordance with the\r\nagreement on the provision of subsidies (identifier of the agreement 000000D730321P5Q0002) and the agreement with HSE University No. 70-2021-00139.","scopus_import":"1"},{"project":[{"call_identifier":"H2020","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","grant_number":"101034413","name":"IST-BRIDGE: International postdoctoral program"}],"citation":{"ama":"Modoranu I-V, Safaryan M, Malinovsky G, et al. MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","apa":"Modoranu, I.-V., Safaryan, M., Malinovsky, G., Kurtic, E., Robert, T., Richtárik, P., &#38; Alistarh, D.-A. (2024). MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Neural Information Processing Systems Foundation.","chicago":"Modoranu, Ionut-Vlad, Mher Safaryan, Grigory Malinovsky, Eldar Kurtic, Thomas Robert, Peter Richtárik, and Dan-Adrian Alistarh. “MICROADAM: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ista":"Modoranu I-V, Safaryan M, Malinovsky G, Kurtic E, Robert T, Richtárik P, Alistarh D-A. 
2024. MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. 38th Conference on Neural Information Processing Systems. , Advances in Neural Information Processing Systems, vol. 37.","ieee":"I.-V. Modoranu <i>et al.</i>, “MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence,” in <i>38th Conference on Neural Information Processing Systems</i>, 2024, vol. 37.","short":"I.-V. Modoranu, M. Safaryan, G. Malinovsky, E. Kurtic, T. Robert, P. Richtárik, D.-A. Alistarh, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","mla":"Modoranu, Ionut-Vlad, et al. “MICROADAM: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024."},"month":"12","type":"conference","external_id":{"arxiv":["2405.15593"]},"quality_controlled":"1","publication_identifier":{"issn":["1049-5258"]},"intvolume":"        37","OA_place":"repository","publisher":"Neural Information Processing Systems Foundation","volume":37,"oa":1,"date_published":"2024-12-20T00:00:00Z","article_processing_charge":"No","oa_version":"Preprint","acknowledged_ssus":[{"_id":"CampIT"}],"date_updated":"2025-05-14T11:32:52Z","alternative_title":["Advances in Neural Information Processing Systems"],"day":"20","publication_status":"published","status":"public","publication":"38th Conference on Neural Information Processing Systems","abstract":[{"lang":"eng","text":"We propose a new variant of the Adam optimizer [Kingma and Ba, 2014] called\r\nMICROADAM that specifically minimizes memory overheads, while maintaining\r\ntheoretical convergence guarantees. We achieve this by compressing the gradient\r\ninformation before it is fed into the optimizer state, thereby reducing its memory\r\nfootprint significantly. 
We control the resulting compression error via a novel\r\ninstance of the classical error feedback mechanism from distributed optimization [Seide et al., 2014, Alistarh et al., 2018, Karimireddy et al., 2019] in which\r\nthe error correction information is itself compressed to allow for practical memory\r\ngains. We prove that the resulting approach maintains theoretical convergence\r\nguarantees competitive to those of AMSGrad, while providing good practical performance. Specifically, we show that MICROADAM can be implemented efficiently\r\non GPUs: on both million-scale (BERT) and billion-scale (LLaMA) models, MICROADAM provides practical convergence competitive to that of the uncompressed\r\nAdam baseline, with lower memory usage and similar running time. Our code is\r\navailable at https://github.com/IST-DASLab/MicroAdam."}],"arxiv":1,"ec_funded":1,"title":"MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence","_id":"19510","scopus_import":"1","acknowledgement":"The authors thank Razvan Pascanu, Mahdi Nikdan and Soroush Tabesh for their valuable feedback, the IT department from Institute of Science and Technology Austria for the hardware support and Weights and Biases for the infrastructure to track all our experiments. 
Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No 101034413.","department":[{"_id":"DaAl"}],"date_created":"2025-04-06T22:01:32Z","author":[{"first_name":"Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","last_name":"Modoranu","full_name":"Modoranu, Ionut-Vlad"},{"first_name":"Mher","full_name":"Safaryan, Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","last_name":"Safaryan"},{"full_name":"Malinovsky, Grigory","last_name":"Malinovsky","first_name":"Grigory"},{"first_name":"Eldar","full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","last_name":"Kurtic"},{"full_name":"Robert, Thomas","id":"de632733-1457-11f0-ae22-b5914b8c1c41","last_name":"Robert","first_name":"Thomas"},{"first_name":"Peter","full_name":"Richtárik, Peter","last_name":"Richtárik"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian"}],"year":"2024","related_material":{"link":[{"url":"https://github.com/IST-DASLab/MicroAdam","relation":"software"}]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2405.15593"}],"OA_type":"green","corr_author":"1"},{"external_id":{"arxiv":["2404.00456"]},"month":"12","type":"conference","quality_controlled":"1","publication_identifier":{"issn":["1049-5258"]},"intvolume":"        37","conference":{"location":"Vancouver, Canada","start_date":"2024-12-09","name":"NeurIPS: Neural Information Processing Systems","end_date":"2024-12-15"},"OA_place":"repository","publisher":"Neural Information Processing Systems Foundation","volume":37,"oa":1,"citation":{"ieee":"S. 
Ashkboos <i>et al.</i>, “QuaRot: Outlier-free 4-bit inference in rotated LLMs,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","ista":"Ashkboos S, Mohtashami A, Croci ML, Li B, Cameron P, Jaggi M, Alistarh D-A, Hoefler T, Hensman J. 2024. QuaRot: Outlier-free 4-bit inference in rotated LLMs. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","chicago":"Ashkboos, Saleh, Amirkeivan Mohtashami, Maximilian L. Croci, Bo Li, Pashmina Cameron, Martin Jaggi, Dan-Adrian Alistarh, Torsten Hoefler, and James Hensman. “QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","mla":"Ashkboos, Saleh, et al. “QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","short":"S. Ashkboos, A. Mohtashami, M.L. Croci, B. Li, P. Cameron, M. Jaggi, D.-A. Alistarh, T. Hoefler, J. Hensman, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","ama":"Ashkboos S, Mohtashami A, Croci ML, et al. QuaRot: Outlier-free 4-bit inference in rotated LLMs. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","apa":"Ashkboos, S., Mohtashami, A., Croci, M. L., Li, B., Cameron, P., Jaggi, M., … Hensman, J. (2024). QuaRot: Outlier-free 4-bit inference in rotated LLMs. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). 
Vancouver, Canada: Neural Information Processing Systems Foundation."},"abstract":[{"lang":"eng","text":"We introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end, including all weights, activations, and KV cache in 4 bits. QuaRot rotates LLMs in a way that removes outliers from the hidden state without changing the output, making quantization easier. This computational invariance is applied to the hidden state (residual) of the LLM, as well as to the activations of the feed-forward components, aspects of the attention mechanism, and to the KV cache. The result is a quantized model where all matrix multiplications are performed in 4 bits, without any channels identified for retention in higher precision. Our 4-bit quantized LLAMA2-70B model has losses of at most 0.47 WikiText-2 perplexity and retains 99% of the zero-shot performance. We also show that QuaRot can provide lossless 6 and 8 bit LLAMA-2 models without any calibration data using round-to-nearest quantization. 
Code is available at github.com/spcl/QuaRot."}],"arxiv":1,"title":"QuaRot: Outlier-free 4-bit inference in rotated LLMs","date_published":"2024-12-20T00:00:00Z","article_processing_charge":"No","oa_version":"Preprint","day":"20","date_updated":"2025-05-14T11:33:12Z","alternative_title":["Advances in Neural Information Processing Systems"],"status":"public","publication_status":"published","publication":"38th Conference on Neural Information Processing Systems","date_created":"2025-04-06T22:01:32Z","department":[{"_id":"DaAl"}],"author":[{"full_name":"Ashkboos, Saleh","last_name":"Ashkboos","first_name":"Saleh"},{"first_name":"Amirkeivan","full_name":"Mohtashami, Amirkeivan","last_name":"Mohtashami"},{"last_name":"Croci","full_name":"Croci, Maximilian L.","first_name":"Maximilian L."},{"last_name":"Li","full_name":"Li, Bo","first_name":"Bo"},{"last_name":"Cameron","full_name":"Cameron, Pashmina","first_name":"Pashmina"},{"full_name":"Jaggi, Martin","last_name":"Jaggi","first_name":"Martin"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian"},{"first_name":"Torsten","full_name":"Hoefler, Torsten","last_name":"Hoefler"},{"first_name":"James","last_name":"Hensman","full_name":"Hensman, James"}],"_id":"19511","scopus_import":"1","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2404.00456"}],"language":[{"iso":"eng"}],"OA_type":"green","year":"2024","related_material":{"link":[{"url":"https://github.com/spcl/QuaRot","relation":"software"}]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87"},{"year":"2024","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2408.17163"}],"language":[{"iso":"eng"}],"corr_author":"1","OA_type":"green","_id":"19518","scopus_import":"1","acknowledgement":"The authors thank the anonymous NeurIPS reviewers for their useful comments and 
feedback, the IT department from the Institute of Science and Technology Austria for the hardware support, and Weights and Biases for the infrastructure to track all our experiments. Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No 101034413.","date_created":"2025-04-06T22:01:32Z","department":[{"_id":"DaAl"},{"_id":"MaMo"}],"author":[{"first_name":"Diyuan","last_name":"Wu","id":"1a5914c2-896a-11ed-bdf8-fb80621a0635","full_name":"Wu, Diyuan"},{"first_name":"Ionut-Vlad","full_name":"Modoranu, Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","last_name":"Modoranu"},{"first_name":"Mher","full_name":"Safaryan, Mher","last_name":"Safaryan","id":"dd546b39-0804-11ed-9c55-ef075c39778d"},{"last_name":"Kuznedelev","full_name":"Kuznedelev, Denis","first_name":"Denis"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian"}],"article_processing_charge":"No","date_published":"2024-12-20T00:00:00Z","acknowledged_ssus":[{"_id":"CampIT"}],"oa_version":"Preprint","day":"20","date_updated":"2025-05-14T11:37:10Z","alternative_title":["Advances in Neural Information Processing Systems"],"publication_status":"published","status":"public","publication":"38th Conference on Neural Information Processing Systems","abstract":[{"text":"The rising footprint of machine learning has led to a focus on imposing model\r\nsparsity as a means of reducing computational and memory costs. For deep neural\r\nnetworks (DNNs), the state-of-the-art accuracy-vs-sparsity is achieved by heuristics\r\ninspired by the classical Optimal Brain Surgeon (OBS) framework [LeCun et al.,\r\n1989, Hassibi and Stork, 1992, Hassibi et al., 1993], which leverages loss curvature\r\ninformation to make better pruning decisions. 
Yet, these results still lack a solid\r\ntheoretical understanding, and it is unclear whether they can be improved by\r\nleveraging connections to the wealth of work on sparse recovery algorithms. In this\r\npaper, we draw new connections between these two areas and present new sparse\r\nrecovery algorithms inspired by the OBS framework that comes with theoretical\r\nguarantees under reasonable assumptions and have strong practical performance.\r\nSpecifically, our work starts from the observation that we can leverage curvature\r\ninformation in OBS-like fashion upon the projection step of classic iterative sparse\r\nrecovery algorithms such as IHT. We show for the first time that this leads both\r\nto improved convergence bounds under standard assumptions. Furthermore, we\r\npresent extensions of this approach to the practical task of obtaining accurate sparse\r\nDNNs, and validate it experimentally at scale for Transformer-based models on\r\nvision and language tasks.","lang":"eng"}],"arxiv":1,"ec_funded":1,"title":"The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information","project":[{"name":"IST-BRIDGE: International postdoctoral program","grant_number":"101034413","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","call_identifier":"H2020"}],"citation":{"ama":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","apa":"Wu, D., Modoranu, I.-V., Safaryan, M., Kuznedelev, D., &#38; Alistarh, D.-A. (2024). The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). 
Vancouver, Canada: Neural Information Processing Systems Foundation.","ista":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. 2024. The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","chicago":"Wu, Diyuan, Ionut-Vlad Modoranu, Mher Safaryan, Denis Kuznedelev, and Dan-Adrian Alistarh. “The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ieee":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, and D.-A. Alistarh, “The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","short":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, D.-A. Alistarh, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","mla":"Wu, Diyuan, et al. “The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 
37, Neural Information Processing Systems Foundation, 2024."},"type":"conference","external_id":{"arxiv":["2408.17163"]},"month":"12","quality_controlled":"1","publication_identifier":{"issn":["1049-5258"]},"OA_place":"repository","intvolume":"        37","conference":{"end_date":"2024-12-15","start_date":"2024-12-09","name":"NeurIPS: Neural Information Processing Systems","location":"Vancouver, Canada"},"volume":37,"publisher":"Neural Information Processing Systems Foundation","oa":1},{"file_date_updated":"2025-04-07T09:17:10Z","OA_type":"gold","language":[{"iso":"eng"}],"year":"2024","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","department":[{"_id":"DaAl"}],"date_created":"2025-04-06T22:01:32Z","author":[{"full_name":"Malinovskii, Vladimir","last_name":"Malinovskii","first_name":"Vladimir"},{"full_name":"Mazur, Denis","last_name":"Mazur","first_name":"Denis"},{"first_name":"Ivan","full_name":"Ilin, Ivan","last_name":"Ilin"},{"full_name":"Kuznedelev, Denis","last_name":"Kuznedelev","first_name":"Denis"},{"last_name":"Burlachenko","full_name":"Burlachenko, Konstantin","first_name":"Konstantin"},{"full_name":"Yi, Kai","last_name":"Yi","first_name":"Kai"},{"last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"},{"full_name":"Richtarik, Peter","last_name":"Richtarik","first_name":"Peter"}],"scopus_import":"1","acknowledgement":"Authors would like to thank Vage Egiazarian, Andrei Panferov and Ruslan Svirschevski for their\r\nhelp and advice on AQLM codebase and running large-scale experiments. We also thank Philip\r\nZmushko and Artem Fedorov for helpful discussions during the early stages of our research. The research of Kai Yi, Konstantin Burlachenko, and Peter Richtárik reported in this publication was supported by funding from King Abdullah University of Science and Technology (KAUST) – Center of Excellence for Generative AI, under award number 5940. 
We would also like to thank our NeurIPS reviewers for their helpful suggestions, we specifically highlight p3Lv’s suggestions to consider smaller codebook sizes and evaluate PV-Tuning with QuIP#, both of which produced interesting findings. Finally, we thank the open-source contributors from llama.cpp and the LocalLlama community for discussions and inspirations on practical use cases of quantized language models, and in particular, Yalda Shabanzadeh and Arthur Aardvark for their help with improving the codebase.","has_accepted_license":"1","_id":"19519","title":"PV-tuning: Beyond straight-through estimation for extreme LLM compression","ddc":["000"],"abstract":[{"lang":"eng","text":"There has been significant interest in \"extreme\" compression of large language models (LLMs), i.e. to 1-2 bits per parameter, which allows such models to be executed efficiently on resource-constrained devices. Existing work focused on improved one-shot quantization techniques and weight representations; yet, purely post-training approaches are reaching diminishing returns in terms of the accuracy-vs-bit-width trade-off. State-of-the-art quantization methods such as QuIP# and AQLM include fine-tuning (part of) the compressed parameters over a limited amount of calibration data; however, such fine-tuning techniques over compressed weights often make exclusive use of straight-through estimators (STE), whose performance is not well-understood in this setting. In this work, we question the use of STE for extreme LLM compression, showing that it can be sub-optimal, and perform a systematic study of quantization-aware fine-tuning strategies for LLMs. We propose PV-Tuning - a representation-agnostic framework that generalizes and improves upon existing fine-tuning strategies, and provides convergence guarantees in restricted cases. On the practical side, when used for 1-2 bit vector quantization, PV-Tuning outperforms prior techniques for highly-performant models such as Llama and Mistral. 
Using PV-Tuning, we achieve the first Pareto-optimal quantization for Llama-2 family models at 2 bits per parameter."}],"arxiv":1,"publication":"38th Conference on Neural Information Processing Systems","status":"public","publication_status":"published","article_processing_charge":"No","date_published":"2024-12-20T00:00:00Z","oa_version":"Published Version","date_updated":"2025-05-14T10:49:20Z","alternative_title":["Advances in Neural Information Processing Systems"],"day":"20","publication_identifier":{"issn":["1049-5258"],"isbn":["9798331314385"]},"OA_place":"publisher","conference":{"location":"Vancouver, Canada","end_date":"2024-12-15","name":"NeurIPS: Neural Information Processing Systems","start_date":"2024-12-10"},"intvolume":"        37","volume":37,"publisher":"Neural Information Processing Systems Foundation","oa":1,"month":"12","type":"conference","external_id":{"arxiv":["2405.14852"]},"quality_controlled":"1","citation":{"chicago":"Malinovskii, Vladimir, Denis Mazur, Ivan Ilin, Denis Kuznedelev, Konstantin Burlachenko, Kai Yi, Dan-Adrian Alistarh, and Peter Richtarik. “PV-Tuning: Beyond Straight-through Estimation for Extreme LLM Compression.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ista":"Malinovskii V, Mazur D, Ilin I, Kuznedelev D, Burlachenko K, Yi K, Alistarh D-A, Richtarik P. 2024. PV-tuning: Beyond straight-through estimation for extreme LLM compression. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","ieee":"V. Malinovskii <i>et al.</i>, “PV-tuning: Beyond straight-through estimation for extreme LLM compression,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","short":"V. Malinovskii, D. Mazur, I. Ilin, D. Kuznedelev, K. Burlachenko, K. Yi, D.-A. Alistarh, P. 
Richtarik, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","mla":"Malinovskii, Vladimir, et al. “PV-Tuning: Beyond Straight-through Estimation for Extreme LLM Compression.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","ama":"Malinovskii V, Mazur D, Ilin I, et al. PV-tuning: Beyond straight-through estimation for extreme LLM compression. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","apa":"Malinovskii, V., Mazur, D., Ilin, I., Kuznedelev, D., Burlachenko, K., Yi, K., … Richtarik, P. (2024). PV-tuning: Beyond straight-through estimation for extreme LLM compression. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation."},"file":[{"success":1,"relation":"main_file","access_level":"open_access","file_size":939712,"date_updated":"2025-04-07T09:17:10Z","file_id":"19521","file_name":"2024_NeurIPS_Malinovskii.pdf","checksum":"54d36f947887e26d0e568b512167001a","content_type":"application/pdf","date_created":"2025-04-07T09:17:10Z","creator":"dernst"}]},{"type":"research_data_reference","department":[{"_id":"DaAl"}],"date_created":"2025-06-24T06:09:18Z","month":"11","author":[{"id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","last_name":"Frantar","full_name":"Frantar, Elias","first_name":"Elias"},{"last_name":"Castro","full_name":"Castro, Roberto","first_name":"Roberto"},{"last_name":"Chen","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","full_name":"Chen, Jiale","first_name":"Jiale","orcid":"0000-0001-5337-5875"},{"first_name":"Torsten","full_name":"Hoefler, Torsten","last_name":"Hoefler"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, 
Dan-Adrian"}],"OA_place":"repository","publisher":"Zenodo","tmp":{"short":"CC BY (4.0)","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode"},"oa":1,"doi":"10.5281/ZENODO.14213091","has_accepted_license":"1","_id":"19884","citation":{"apa":"Frantar, E., Castro, R., Chen, J., Hoefler, T., &#38; Alistarh, D.-A. (2024). MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. Zenodo. <a href=\"https://doi.org/10.5281/ZENODO.14213091\">https://doi.org/10.5281/ZENODO.14213091</a>","ama":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. 2024. doi:<a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>","short":"E. Frantar, R. Castro, J. Chen, T. Hoefler, D.-A. Alistarh, (2024).","mla":"Frantar, Elias, et al. <i>MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models</i>. Zenodo, 2024, doi:<a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>.","ista":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. 2024. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models, Zenodo, <a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>.","chicago":"Frantar, Elias, Roberto Castro, Jiale Chen, Torsten Hoefler, and Dan-Adrian Alistarh. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” Zenodo, 2024. <a href=\"https://doi.org/10.5281/ZENODO.14213091\">https://doi.org/10.5281/ZENODO.14213091</a>.","ieee":"E. Frantar, R. Castro, J. Chen, T. Hoefler, and D.-A. 
Alistarh, “MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models.” Zenodo, 2024."},"main_file_link":[{"open_access":"1","url":"https://doi.org/10.5281/ZENODO.14213091"}],"abstract":[{"lang":"eng","text":"This is Marlin, a Mixed Auto-Regressive Linear kernel (and the name of one of the planet's fastest fish), an extremely optimized FP16xINT4 matmul kernel aimed at LLM inference that can deliver close to ideal (4x) speedups up to batchsizes of 16-32 tokens (in contrast to the 1-2 tokens of prior work with comparable speedup).\r\n\r\nAdditionally, it includes Sparse-Marlin, an extension of the MARLIN kernels adding support to 2:4 weight sparsity, achieving 5.3x speedups on NVIDIA GPUs (Ampere/Ada)."}],"ddc":["000"],"corr_author":"1","title":"MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models","article_processing_charge":"No","related_material":{"record":[{"relation":"used_for_analysis_in","id":"19877","status":"public"}]},"year":"2024","date_published":"2024-11-24T00:00:00Z","day":"24","date_updated":"2025-09-30T13:41:56Z","license":"https://creativecommons.org/licenses/by/4.0/","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version","status":"public"},{"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","corr_author":"1","file_date_updated":"2024-07-29T07:37:31Z","language":[{"iso":"eng"}],"acknowledgement":"This work was supported in part by the ERC-2020-CoG 863818 (FoRM-SMArt) grant. 
We thank James Aspnes and Thomas Sauerwald for several helpful discussions on Ehrenfest random walks.","scopus_import":"1","_id":"17329","doi":"10.1145/3662158.3662768","has_accepted_license":"1","author":[{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian"},{"first_name":"Krishnendu","orcid":"0000-0002-4561-241X","last_name":"Chatterjee","id":"2E5DCA20-F248-11E8-B48F-1D18A9856A87","full_name":"Chatterjee, Krishnendu"},{"first_name":"Mehrdad","full_name":"Karrabi, Mehrdad","last_name":"Karrabi","id":"67638922-f394-11eb-9cf6-f20423e08757"},{"first_name":"John M","last_name":"Lazarsfeld","id":"17ce3656-183e-11ef-84c3-8932383e1b23","full_name":"Lazarsfeld, John M"}],"date_created":"2024-07-28T22:01:10Z","department":[{"_id":"DaAl"},{"_id":"KrCh"}],"publication_status":"published","status":"public","publication":"Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing","oa_version":"Published Version","date_updated":"2025-04-14T07:52:47Z","day":"17","article_processing_charge":"Yes (via OA deal)","date_published":"2024-06-17T00:00:00Z","title":"Game dynamics and equilibrium computation in the population protocol model","ddc":["000"],"ec_funded":1,"abstract":[{"text":"We initiate the study of game dynamics in the population protocol model: n agents each maintain a current local strategy and interact in pairs uniformly at random. Upon each interaction, the agents play a two-person game and receive a payoff from an underlying utility function, and they can subsequently update their strategies according to a fixed local algorithm. In this setting, we ask how the distribution over agent strategies evolves over a sequence of interactions, and we introduce a new distributional equilibrium concept to quantify the quality of such distributions. 
As an initial example, we study a class of repeated prisoner's dilemma games, and we consider a family of simple local update algorithms that yield non-trivial dynamics over the distribution of agent strategies. We show that these dynamics are related to a new class of high-dimensional Ehrenfest random walks, and we derive exact characterizations of their stationary distributions, bounds on their mixing times, and prove their convergence to approximate distributional equilibria. Our results highlight trade-offs between the local state space of each agent, and the convergence rate and approximation factor of the underlying dynamics. Our approach opens the door towards the further characterization of equilibrium computation for other classes of games and dynamics in the population setting.","lang":"eng"}],"citation":{"ama":"Alistarh D-A, Chatterjee K, Karrabi M, Lazarsfeld JM. Game dynamics and equilibrium computation in the population protocol model. In: <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>. Association for Computing Machinery; 2024:40-49. doi:<a href=\"https://doi.org/10.1145/3662158.3662768\">10.1145/3662158.3662768</a>","apa":"Alistarh, D.-A., Chatterjee, K., Karrabi, M., &#38; Lazarsfeld, J. M. (2024). Game dynamics and equilibrium computation in the population protocol model. In <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i> (pp. 40–49). Nantes, France: Association for Computing Machinery. <a href=\"https://doi.org/10.1145/3662158.3662768\">https://doi.org/10.1145/3662158.3662768</a>","ieee":"D.-A. Alistarh, K. Chatterjee, M. Karrabi, and J. M. Lazarsfeld, “Game dynamics and equilibrium computation in the population protocol model,” in <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, Nantes, France, 2024, pp. 40–49.","chicago":"Alistarh, Dan-Adrian, Krishnendu Chatterjee, Mehrdad Karrabi, and John M Lazarsfeld. 
“Game Dynamics and Equilibrium Computation in the Population Protocol Model.” In <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, 40–49. Association for Computing Machinery, 2024. <a href=\"https://doi.org/10.1145/3662158.3662768\">https://doi.org/10.1145/3662158.3662768</a>.","ista":"Alistarh D-A, Chatterjee K, Karrabi M, Lazarsfeld JM. 2024. Game dynamics and equilibrium computation in the population protocol model. Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing. PODC: Symposium on Principles of Distributed Computing, 40–49.","mla":"Alistarh, Dan-Adrian, et al. “Game Dynamics and Equilibrium Computation in the Population Protocol Model.” <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, Association for Computing Machinery, 2024, pp. 40–49, doi:<a href=\"https://doi.org/10.1145/3662158.3662768\">10.1145/3662158.3662768</a>.","short":"D.-A. Alistarh, K. Chatterjee, M. Karrabi, J.M. Lazarsfeld, in:, Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing, Association for Computing Machinery, 2024, pp. 
40–49."},"file":[{"file_name":"2024_ACMPODC_Alistarh.pdf","checksum":"65a40437f83373fa79dd999d5287509e","file_id":"17335","creator":"dernst","content_type":"application/pdf","date_created":"2024-07-29T07:37:31Z","relation":"main_file","success":1,"date_updated":"2024-07-29T07:37:31Z","access_level":"open_access","file_size":750908}],"project":[{"grant_number":"863818","_id":"0599E47C-7A3F-11EA-A408-12923DDC885E","call_identifier":"H2020","name":"Formal Methods for Stochastic Models: Algorithms and Applications"}],"publisher":"Association for Computing Machinery","tmp":{"short":"CC BY (4.0)","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode"},"oa":1,"publication_identifier":{"isbn":["9798400706684"]},"conference":{"name":"PODC: Symposium on Principles of Distributed Computing","start_date":"2024-06-17","end_date":"2024-06-21","location":"Nantes, France"},"quality_controlled":"1","page":"40-49","type":"conference","month":"06"},{"scopus_import":"1","acknowledgement":"We thank Trevor Brown and Yuanhao Wei for the discussion and anonymous reviewers for helping us to improve the paper. 
Also, we thank JetBrains and Huawei for their support.","doi":"10.1109/IPDPS57955.2024.00023","_id":"17332","date_created":"2024-07-28T22:01:11Z","department":[{"_id":"DaAl"}],"author":[{"first_name":"Ilya","full_name":"Kokorin, Ilya","last_name":"Kokorin"},{"first_name":"Victor","full_name":"Yudov, Victor","last_name":"Yudov"},{"full_name":"Aksenov, Vitaly","last_name":"Aksenov","first_name":"Vitaly"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"year":"2024","isi":1,"user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","corr_author":"1","language":[{"iso":"eng"}],"main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2310.05293","open_access":"1"}],"citation":{"short":"I. Kokorin, V. Yudov, V. Aksenov, D.-A. Alistarh, in:, 2024 IEEE International Parallel and Distributed Processing Symposium, IEEE, 2024, pp. 169–179.","mla":"Kokorin, Ilya, et al. “Wait-Free Trees with Asymptotically-Efficient Range Queries.” <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, IEEE, 2024, pp. 169–79, doi:<a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">10.1109/IPDPS57955.2024.00023</a>.","ista":"Kokorin I, Yudov V, Aksenov V, Alistarh D-A. 2024. Wait-free trees with asymptotically-efficient range queries. 2024 IEEE International Parallel and Distributed Processing Symposium. IPDPS: International Parallel and Distributed Processing Symposium, 169–179.","chicago":"Kokorin, Ilya, Victor Yudov, Vitaly Aksenov, and Dan-Adrian Alistarh. “Wait-Free Trees with Asymptotically-Efficient Range Queries.” In <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, 169–79. IEEE, 2024. <a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">https://doi.org/10.1109/IPDPS57955.2024.00023</a>.","ieee":"I. Kokorin, V. Yudov, V. Aksenov, and D.-A. 
Alistarh, “Wait-free trees with asymptotically-efficient range queries,” in <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, San Francisco, CA, United States, 2024, pp. 169–179.","apa":"Kokorin, I., Yudov, V., Aksenov, V., &#38; Alistarh, D.-A. (2024). Wait-free trees with asymptotically-efficient range queries. In <i>2024 IEEE International Parallel and Distributed Processing Symposium</i> (pp. 169–179). San Francisco, CA, United States: IEEE. <a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">https://doi.org/10.1109/IPDPS57955.2024.00023</a>","ama":"Kokorin I, Yudov V, Aksenov V, Alistarh D-A. Wait-free trees with asymptotically-efficient range queries. In: <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>. IEEE; 2024:169-179. doi:<a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">10.1109/IPDPS57955.2024.00023</a>"},"conference":{"location":"San Francisco, CA, United States","name":"IPDPS: International Parallel and Distributed Processing Symposium","end_date":"2024-05-31","start_date":"2024-05-27"},"publication_identifier":{"eissn":["1530-2075"],"isbn":["9798350337662"]},"publisher":"IEEE","oa":1,"external_id":{"isi":["001270389600078"],"arxiv":["2310.05293"]},"type":"conference","month":"07","page":"169-179","quality_controlled":"1","publication_status":"published","publication":"2024 IEEE International Parallel and Distributed Processing Symposium","status":"public","article_processing_charge":"No","date_published":"2024-07-08T00:00:00Z","date_updated":"2025-09-08T08:29:45Z","day":"08","oa_version":"Preprint","title":"Wait-free trees with asymptotically-efficient range queries","arxiv":1,"abstract":[{"lang":"eng","text":"Tree data structures, such as red-black trees, quad trees, treaps, or tries, are fundamental tools in computer science. A classical problem in concurrency is to obtain expressive, efficient, and scalable versions of practical tree data structures. 
We are interested in concurrent trees supporting range queries, i.e., queries that involve multiple consecutive data items. Existing implementations with this capability can list keys in a specific range, but do not support aggregate range queries: for instance, if we want to calculate the number of keys in a range, the only choice is to retrieve a whole list and return its size. This is suboptimal: in the sequential setting, one can augment a balanced search tree with counters and, consequently, perform these aggregate requests in logarithmic rather than linear time. In this paper, we propose a generic approach to implement a broad class of range queries on concurrent trees in a way that is wait-free, asymptotically efficient, and practically scalable. The key idea is a new mechanism for maintaining metadata concurrently at tree nodes, which can be seen as a wait-free variant of hand-over-hand locking (which we call hand-over-hand helping). We did a preliminary implementation of the wait-free binary search tree and preliminary experiments have indicated the soundness of our approach."}]},{"date_published":"2024-04-01T00:00:00Z","article_processing_charge":"No","day":"01","date_updated":"2026-06-18T17:55:24Z","oa_version":"Published Version","status":"public","publication_status":"published","publication":"Proceedings of Machine Learning and Systems ","arxiv":1,"abstract":[{"text":"Data-parallel distributed training of deep neural networks (DNN) has gained very widespread adoption, but can still experience communication bottlenecks. To address this issue, entire families of compression mechanisms have been developed, including quantization, sparsification, and low-rank approximation, some of which are seeing significant practical adoption. 
Despite this progress, almost all known compression schemes apply compression uniformly across DNN layers, although layers are heterogeneous in terms of parameter count and their impact on model accuracy. In this work, we provide a general framework for adapting the degree of compression across the model's layers dynamically during training, improving the overall compression, while leading to substantial speedups, without sacrificing accuracy. Our framework, called L-GreCo, is based on an adaptive algorithm, which automatically picks the optimal compression parameters for model layers guaranteeing the best compression ratio while satisfying an error constraint. Extensive experiments over image classification and language modeling tasks show that L-GreCo is effective across all existing families of compression methods, and achieves up to 2.5× training speedup and up to 5× compression improvement over efficient implementations of existing approaches, while recovering full accuracy. Moreover, L-GreCo is complementary to existing adaptive algorithms, improving their compression ratio by 50% and practical throughput by 66%. An anonymized implementation is available at https://github.com/LGrCo/L-GreCo.","lang":"eng"}],"ddc":["000"],"title":"L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning","citation":{"apa":"Markov, I., Alimohammadi, K., Frantar, E., &#38; Alistarh, D.-A. (2024). L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. In P. Gibbons, G. Pekhimenko, &#38; C. De Sa (Eds.), <i>Proceedings of Machine Learning and Systems </i> (Vol. 6). Athens, Greece: Association for Computing Machinery.","ama":"Markov I, Alimohammadi K, Frantar E, Alistarh D-A. L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. In: Gibbons P, Pekhimenko G, De Sa C, eds. <i>Proceedings of Machine Learning and Systems </i>. Vol 6. 
Association for Computing Machinery; 2024.","short":"I. Markov, K. Alimohammadi, E. Frantar, D.-A. Alistarh, in:, P. Gibbons, G. Pekhimenko, C. De Sa (Eds.), Proceedings of Machine Learning and Systems , Association for Computing Machinery, 2024.","mla":"Markov, Ilia, et al. “L-GreCo: Layerwise-Adaptive Gradient Compression for Efficient Data-Parallel Deep Learning.” <i>Proceedings of Machine Learning and Systems </i>, edited by P. Gibbons et al., vol. 6, Association for Computing Machinery, 2024.","chicago":"Markov, Ilia, Kaveh Alimohammadi, Elias Frantar, and Dan-Adrian Alistarh. “L-GreCo: Layerwise-Adaptive Gradient Compression for Efficient Data-Parallel Deep Learning.” In <i>Proceedings of Machine Learning and Systems </i>, edited by P. Gibbons, G. Pekhimenko, and C. De Sa, Vol. 6. Association for Computing Machinery, 2024.","ista":"Markov I, Alimohammadi K, Frantar E, Alistarh D-A. 2024. L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. Proceedings of Machine Learning and Systems . MLSys: Machine Learning and Systems vol. 6.","ieee":"I. Markov, K. Alimohammadi, E. Frantar, and D.-A. Alistarh, “L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning,” in <i>Proceedings of Machine Learning and Systems </i>, Athens, Greece, 2024, vol. 
6."},"type":"conference","external_id":{"arxiv":["2210.17357"]},"month":"04","quality_controlled":"1","conference":{"end_date":"2024-04-22","start_date":"2024-04-22","name":"MLSys: Machine Learning and Systems","location":"Athens, Greece"},"intvolume":"         6","volume":6,"oa":1,"publisher":"Association for Computing Machinery","year":"2024","related_material":{"record":[{"id":"17490","relation":"dissertation_contains","status":"public"}]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://proceedings.mlsys.org/paper_files/paper/2024/hash/9069a8976ff06f6443e7f4172990a580-Abstract-Conference.html"}],"editor":[{"full_name":"Gibbons, P.","last_name":"Gibbons","first_name":"P."},{"first_name":"G.","last_name":"Pekhimenko","full_name":"Pekhimenko, G."},{"first_name":"C.","full_name":"De Sa, C.","last_name":"De Sa"}],"corr_author":"1","_id":"17456","department":[{"_id":"DaAl"}],"date_created":"2024-08-22T08:29:25Z","author":[{"first_name":"Ilia","last_name":"Markov","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","full_name":"Markov, Ilia"},{"last_name":"Alimohammadi","full_name":"Alimohammadi, Kaveh","first_name":"Kaveh"},{"full_name":"Frantar, Elias","last_name":"Frantar","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}]},{"_id":"17490","has_accepted_license":"1","doi":"10.15479/at:ista:17490","author":[{"full_name":"Markov, 
Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","last_name":"Markov","first_name":"Ilia"}],"department":[{"_id":"GradSch"},{"_id":"DaAl"}],"date_created":"2024-09-04T08:51:11Z","degree_awarded":"PhD","license":"https://creativecommons.org/licenses/by-nc-sa/4.0/","user_id":"ba8df636-2132-11f1-aed0-ed93e2281fdd","related_material":{"record":[{"relation":"part_of_dissertation","id":"14461","status":"public"},{"id":"12780","relation":"part_of_dissertation","status":"public"},{"id":"17456","relation":"part_of_dissertation","status":"public"}]},"year":"2024","supervisor":[{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian"}],"corr_author":"1","file_date_updated":"2024-09-04T08:36:06Z","language":[{"iso":"eng"}],"citation":{"ama":"Markov I. Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective. 2024. doi:<a href=\"https://doi.org/10.15479/at:ista:17490\">10.15479/at:ista:17490</a>","apa":"Markov, I. (2024). <i>Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:17490\">https://doi.org/10.15479/at:ista:17490</a>","chicago":"Markov, Ilia. “Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective.” Institute of Science and Technology Austria, 2024. <a href=\"https://doi.org/10.15479/at:ista:17490\">https://doi.org/10.15479/at:ista:17490</a>.","ista":"Markov I. 2024. Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective. Institute of Science and Technology Austria.","ieee":"I. Markov, “Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective,” Institute of Science and Technology Austria, 2024.","short":"I. 
Markov, Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective, Institute of Science and Technology Austria, 2024.","mla":"Markov, Ilia. <i>Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective</i>. Institute of Science and Technology Austria, 2024, doi:<a href=\"https://doi.org/10.15479/at:ista:17490\">10.15479/at:ista:17490</a>."},"file":[{"relation":"source_file","date_updated":"2024-09-04T08:35:35Z","access_level":"closed","file_size":43327753,"file_id":"17491","checksum":"77609f4835d2730e46fa0d42d9134ed9","file_name":"Thesis.zip","date_created":"2024-09-04T08:35:35Z","content_type":"application/x-zip-compressed","creator":"imarkov"},{"creator":"imarkov","date_created":"2024-09-04T08:36:06Z","content_type":"application/pdf","checksum":"9e68f7217570f756ceb8f70b980938cd","file_name":"Thesis_final_version_pdfa2.pdf","file_id":"17492","file_size":2756082,"access_level":"open_access","date_updated":"2024-09-04T08:36:06Z","relation":"main_file","success":1}],"project":[{"call_identifier":"H2020","_id":"268A44D6-B435-11E9-9278-68D0E5697425","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning"}],"oa":1,"tmp":{"short":"CC BY-NC-SA (4.0)","image":"/images/cc_by_nc_sa.png","name":"Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)","legal_code_url":"https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode"},"publisher":"Institute of Science and Technology Austria","publication_identifier":{"issn":["2663-337X"]},"OA_place":"publisher","page":"102","type":"dissertation","month":"09","publication_status":"published","status":"public","acknowledged_ssus":[{"_id":"ScienComp"}],"oa_version":"Published Version","date_updated":"2026-06-18T17:55:23Z","day":"04","alternative_title":["ISTA Thesis"],"article_processing_charge":"No","date_published":"2024-09-04T00:00:00Z","title":"Communication-efficient 
distributed training of deep neural networks : An algorithms and systems perspective","ddc":["000"],"ec_funded":1,"abstract":[{"text":"Deep learning is essential in numerous applications nowadays, with many recent advancements made possible by training very large models. Despite their broad applicability, training neural networks is often time-intensive, and it is usually impractical to manage large models and datasets on a single machine. To address these issues, distributed deep learning training has become increasingly important. However, distributed training requires synchronization among nodes, and the mini-batch stochastic gradient descent algorithm places a significant load on network connections. A possible solution to tackle the synchronization bottleneck is to reduce a message size by lossy compression.\r\n\r\nIn this thesis, we investigate systems and algorithmic approaches to communication compression during training. From the systems perspective, we demonstrate that a common approach of expensive hardware overprovisioning can be replaced through a thorough system design. We introduce a framework that introduces efficient software support for compressed communication in machine learning applications, applicable to both multi-GPU single-node training and larger-scale multi-node training. Our framework integrates with popular ML frameworks, providing up to 3x speedups for multi-GPU nodes based on commodity hardware and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.\r\n\r\nAlso, we consider an application of our framework to different communication schemes, such as Fully Sharded Data Parallel. We provide strong convergence guarantees for the compression in such a setup. 
Empirical validation shows that our method preserves model accuracy for GPT-family models with up to 1.3 billion parameters, while completely removing the communication bottlenecks of non-compressed alternatives, providing up to 2.2x speedups end-to-end.\r\n\r\nFrom the algorithmic side, we propose a general framework that dynamically adjusts the degree of compression across a model's layers during training. This approach enhances overall compression and results in significant speedups without compromising accuracy. Our framework utilizes an adaptive algorithm that automatically selects the optimal compression parameters for model layers, ensuring the best compression ratio while adhering to an error constraint. Our method is effective across all existing families of compression methods. It achieves up to 2.5x faster training and up to a 5x improvement in compression compared to efficient implementations of current approaches. Additionally, L-GreCo can complement existing adaptive algorithms.\r\n","lang":"eng"}]},{"language":[{"iso":"eng"}],"abstract":[{"text":"Parallel SGD in a shared-memory setting is oft-represented by the popular Hogwild! algorithm, in which lock-free updates are asynchronously performed by multiple computing processes. Unfortunately, scaling Hogwild! to distributed workers is largely unexplored. Specifically, it is unknown if any adaptation of Hogwild! to the popular decentralized multi-GPU setting offers any competitive speedup, either empirically or theoretically. In this work, we investigate the potential of decentralizing Hogwild! by incorporating simultaneously (a) asynchronous local gradient updates on the shared memory of GPUs, and (b) non-blocking asynchronous decentralized federated averaging. A naive direct implementation shows degradation in performance, arising from scheduling overheads and concurrent write conflicts on GPUs. 
To mitigate these drawbacks, we investigate and propose a new method, based on careful block selection rules, which update only portions of the parameter vectors. Our experiments show that the resulting decentralized training method exhibits improved throughput and competitive accuracy for standard image classification benchmarks on the CIFAR-10, CIFAR-100, and Imagenet datasets. On the theoretical side, we prove that our method guarantees sublinear ergodic convergence rates for non-convex objectives.","lang":"eng"}],"title":"Federated SGD with local asynchrony","corr_author":"1","article_processing_charge":"No","date_published":"2024-07-26T00:00:00Z","year":"2024","isi":1,"day":"26","date_updated":"2025-09-08T09:23:48Z","oa_version":"None","user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","status":"public","publication_status":"published","publication":"Proceedings of the 44th International Conference on Distributed Computing Systems","department":[{"_id":"DaAl"}],"external_id":{"isi":["001304430200075"]},"date_created":"2024-09-15T22:01:41Z","type":"conference","month":"07","page":"857-868","author":[{"last_name":"Chatterjee","id":"3C41A08A-F248-11E8-B48F-1D18A9856A87","full_name":"Chatterjee, Bapi","first_name":"Bapi","orcid":"0000-0002-2742-4028"},{"first_name":"Vyacheslav","last_name":"Kungurtsev","full_name":"Kungurtsev, Vyacheslav"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"quality_controlled":"1","conference":{"name":"ICDCS: International Conference on Distributed Computing Systems","end_date":"2024-07-26","start_date":"2024-07-23","location":"Jersey City, NJ, United States"},"publication_identifier":{"issn":["1063-6927"],"isbn":["9798350386059"],"eissn":["2575-8411"]},"publisher":"IEEE","doi":"10.1109/ICDCS60910.2024.00084","_id":"18070","scopus_import":"1","citation":{"chicago":"Chatterjee, Bapi, Vyacheslav Kungurtsev, and Dan-Adrian 
Alistarh. “Federated SGD with Local Asynchrony.” In <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, 857–68. IEEE, 2024. <a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">https://doi.org/10.1109/ICDCS60910.2024.00084</a>.","ista":"Chatterjee B, Kungurtsev V, Alistarh D-A. 2024. Federated SGD with local asynchrony. Proceedings of the 44th International Conference on Distributed Computing Systems. ICDCS: International Conference on Distributed Computing Systems, 857–868.","ieee":"B. Chatterjee, V. Kungurtsev, and D.-A. Alistarh, “Federated SGD with local asynchrony,” in <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, Jersey City, NJ, United States, 2024, pp. 857–868.","short":"B. Chatterjee, V. Kungurtsev, D.-A. Alistarh, in:, Proceedings of the 44th International Conference on Distributed Computing Systems, IEEE, 2024, pp. 857–868.","mla":"Chatterjee, Bapi, et al. “Federated SGD with Local Asynchrony.” <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, IEEE, 2024, pp. 857–68, doi:<a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">10.1109/ICDCS60910.2024.00084</a>.","ama":"Chatterjee B, Kungurtsev V, Alistarh D-A. Federated SGD with local asynchrony. In: <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>. IEEE; 2024:857-868. doi:<a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">10.1109/ICDCS60910.2024.00084</a>","apa":"Chatterjee, B., Kungurtsev, V., &#38; Alistarh, D.-A. (2024). Federated SGD with local asynchrony. In <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i> (pp. 857–868). Jersey City, NJ, United States: IEEE. 
<a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">https://doi.org/10.1109/ICDCS60910.2024.00084</a>"}},{"publication_status":"published","publication":"Proceedings of the 41st International Conference on Machine Learning","status":"public","alternative_title":["PMLR"],"date_updated":"2024-10-01T08:13:05Z","day":"01","oa_version":"Preprint","article_processing_charge":"No","date_published":"2024-09-01T00:00:00Z","title":"Extreme compression of large language models via additive quantization","arxiv":1,"abstract":[{"lang":"eng","text":"The emergence of accurate open large language models (LLMs) has led to a race towards performant quantization techniques which can enable their execution on end-user devices. In this paper, we revisit the problem of “extreme” LLM compression—defined as targeting extremely low bit counts, such as 2 to 3 bits per parameter—from the point of view of classic methods in Multi-Codebook Quantization (MCQ). Our algorithm, called AQLM, generalizes the classic Additive Quantization (AQ) approach for information retrieval to advance the state-of-the-art in LLM compression, via two innovations: 1) learned additive quantization of weight matrices in input-adaptive fashion, and 2) joint optimization of codebook parameters across each transformer block. Broadly, AQLM is the first scheme that is Pareto optimal in terms of accuracy-vs-model-size when compressing to less than 3 bits per parameter, and significantly improves upon all known schemes in the extreme compression (2bit) regime. In addition, AQLM is practical: we provide fast GPU and CPU implementations of AQLM for token generation, which enable us to match or outperform optimized FP16 implementations for speed, while executing in a much smaller memory footprint."}],"citation":{"mla":"Egiazarian, Vage, et al. “Extreme Compression of Large Language Models via Additive Quantization.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 
235, ML Research Press, 2024, pp. 12284–303.","short":"V. Egiazarian, A. Panferov, D. Kuznedelev, E. Frantar, A. Babenko, D.-A. Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 12284–12303.","ieee":"V. Egiazarian, A. Panferov, D. Kuznedelev, E. Frantar, A. Babenko, and D.-A. Alistarh, “Extreme compression of large language models via additive quantization,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 12284–12303.","chicago":"Egiazarian, Vage, Andrei Panferov, Denis Kuznedelev, Elias Frantar, Artem Babenko, and Dan-Adrian Alistarh. “Extreme Compression of Large Language Models via Additive Quantization.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:12284–303. ML Research Press, 2024.","ista":"Egiazarian V, Panferov A, Kuznedelev D, Frantar E, Babenko A, Alistarh D-A. 2024. Extreme compression of large language models via additive quantization. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 12284–12303.","apa":"Egiazarian, V., Panferov, A., Kuznedelev, D., Frantar, E., Babenko, A., &#38; Alistarh, D.-A. (2024). Extreme compression of large language models via additive quantization. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 12284–12303). Vienna, Austria: ML Research Press.","ama":"Egiazarian V, Panferov A, Kuznedelev D, Frantar E, Babenko A, Alistarh D-A. Extreme compression of large language models via additive quantization. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. 
ML Research Press; 2024:12284-12303."},"volume":235,"oa":1,"publisher":"ML Research Press","intvolume":"       235","conference":{"start_date":"2024-07-21","end_date":"2024-07-27","name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria"},"publication_identifier":{"eissn":["2640-3498"]},"quality_controlled":"1","type":"conference","month":"09","external_id":{"arxiv":["2401.06118"]},"page":"12284-12303","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","corr_author":"1","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2401.06118"}],"language":[{"iso":"eng"}],"acknowledgement":"Authors would like to thank Ruslan Svirschevski for his help in solving technical issues with AQLM and baselines. We also thank Tim Dettmers for helpful discussions on the structure of weights in modern LLMs and size-accuracy trade-offs. The authors would also like to thank Daniil Pavlov for his assistance with CPU benchmarking. Finally, authors would like to thank the communities of ML enthusiasts known as LocalLLaMA and Petals community on discord for the crowd wisdom about running LLMs on consumer devices. Egiazarian Vage and Denis Kuznedelev and Andrei Panferov were supported by the grant for research centers in the field of AI provided by the Analytical Center for the Government of the Russian Federation (ACRF) in accordance with the agreement on the provision of subsidies (identifier of the agreement 000000D730321P5Q0002) and the agreement with HSE University No. 
70-2021-00139.","scopus_import":"1","_id":"18113","author":[{"last_name":"Egiazarian","full_name":"Egiazarian, Vage","first_name":"Vage"},{"first_name":"Andrei","last_name":"Panferov","id":"2c18daae-4dbe-11ef-8491-98ce2d960f09","full_name":"Panferov, Andrei"},{"first_name":"Denis","last_name":"Kuznedelev","full_name":"Kuznedelev, Denis"},{"first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","last_name":"Frantar","full_name":"Frantar, Elias"},{"first_name":"Artem","last_name":"Babenko","full_name":"Babenko, Artem"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"date_created":"2024-09-22T22:01:43Z","department":[{"_id":"DaAl"},{"_id":"GradSch"}]},{"language":[{"iso":"eng"}],"main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2401.04679","open_access":"1"}],"corr_author":"1","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2024","related_material":{"link":[{"url":"https://github.com/IST-DASLab/RoSA","relation":"software"}]},"author":[{"first_name":"Mahdi","full_name":"Nikdan, Mahdi","last_name":"Nikdan","id":"66374281-f394-11eb-9cf6-869147deecc0"},{"first_name":"Soroush","orcid":"0009-0003-4119-6281","last_name":"Tabesh","id":"06000900-6068-11ef-8d61-c2472ef2e752","full_name":"Tabesh, Soroush"},{"first_name":"Elvir","last_name":"Crncevic","id":"41888001-440d-11ef-8299-d0e838b8185e","full_name":"Crncevic, Elvir"},{"orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"department":[{"_id":"DaAl"},{"_id":"GradSch"}],"date_created":"2024-09-22T22:01:44Z","_id":"18117","acknowledgement":"The authors would like to thank Eldar Kurtic for experimental support and useful suggestions throughout the project","scopus_import":"1","arxiv":1,"abstract":[{"lang":"eng","text":"We investigate parameter-efficient fine-tuning (PEFT) methods that 
can provide good accuracy under limited computational and memory budgets in the context of large language models (LLMs). We present a new PEFT method called Robust Adaptation (RoSA) inspired by robust principal component analysis that jointly trains low-rank and highly-sparse components on top of a set of fixed pretrained weights to efficiently approximate the performance of a full-fine-tuning (FFT) solution. Across a series of challenging generative tasks such as grade-school math and SQL query generation, which require fine-tuning for good performance, we show that RoSA outperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at the same parameter budget, and can even recover the performance of FFT on some tasks. We provide system support for RoSA to complement the training algorithm, specifically in the form of sparse GPU kernels which enable memory- and computationally-efficient training, and show that it is also compatible with low-precision base weights, resulting in the first joint representation combining quantization, low-rank and sparse approximations. Our code is available at https://github.com/IST-DASLab/RoSA."}],"title":"RoSA: Accurate parameter-efficient fine-tuning via robust adaptation","day":"01","date_updated":"2024-10-01T08:22:01Z","oa_version":"Preprint","date_published":"2024-09-01T00:00:00Z","article_processing_charge":"No","publication":"Proceedings of the 41st International Conference on Machine Learning","status":"public","publication_status":"published","quality_controlled":"1","external_id":{"arxiv":["2401.04679"]},"month":"09","type":"conference","page":"38187-38206","publisher":"ML Research Press","volume":235,"oa":1,"intvolume":"       235","conference":{"location":"Vienna, Austria","start_date":"2024-07-21","name":"ICML: International Conference on Machine Learning","end_date":"2024-07-27"},"publication_identifier":{"eissn":["2640-3498"]},"citation":{"short":"M. Nikdan, S. Tabesh, E. Crncevic, D.-A. 
Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 38187–38206.","mla":"Nikdan, Mahdi, et al. “RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 38187–206.","chicago":"Nikdan, Mahdi, Soroush Tabesh, Elvir Crncevic, and Dan-Adrian Alistarh. “RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:38187–206. ML Research Press, 2024.","ista":"Nikdan M, Tabesh S, Crncevic E, Alistarh D-A. 2024. RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning vol. 235, 38187–38206.","ieee":"M. Nikdan, S. Tabesh, E. Crncevic, and D.-A. Alistarh, “RoSA: Accurate parameter-efficient fine-tuning via robust adaptation,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 38187–38206.","apa":"Nikdan, M., Tabesh, S., Crncevic, E., &#38; Alistarh, D.-A. (2024). RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 38187–38206). Vienna, Austria: ML Research Press.","ama":"Nikdan M, Tabesh S, Crncevic E, Alistarh D-A. RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. 
ML Research Press; 2024:38187-38206."}},{"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2310.04519"}],"language":[{"iso":"eng"}],"corr_author":"1","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","related_material":{"record":[{"status":"public","relation":"dissertation_contains","id":"21854"}],"link":[{"relation":"software","url":"https://github.com/IST-DASLab/SPADE"}]},"year":"2024","author":[{"first_name":"Arshia Soltani","last_name":"Moakhar","full_name":"Moakhar, Arshia Soltani"},{"orcid":"0000-0002-7778-3221","first_name":"Eugenia B","full_name":"Iofinova, Eugenia B","last_name":"Iofinova","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117"},{"last_name":"Frantar","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias","first_name":"Elias"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"}],"department":[{"_id":"DaAl"}],"date_created":"2024-09-22T22:01:46Z","_id":"18121","acknowledgement":"The authors would like to thank Stephen Casper and Tony Wang for their feedback on this work, and Eldar Kurtic for his advice on aspects of the project. This research was supported by the Scientific Service Units (SSU) of IST Austria through resources provided by Scientific Computing (SciComp). EI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35.","scopus_import":"1","abstract":[{"lang":"eng","text":"It is known that sparsity can improve interpretability for deep neural networks. However, existing methods in the area either require networks that are pre-trained with sparsity constraints, or impose sparsity after the fact, altering the network’s general behavior. In this paper, we demonstrate, for the first time, that sparsity can instead be incorporated into the interpretation process itself, as a sample-specific preprocessing step. 
Unlike previous work, this approach, which we call SPADE, does not place constraints on the trained model and does not affect its behavior during inference on the sample. Given a trained model and a target sample, SPADE uses sample-targeted pruning to provide a \"trace\" of the network’s execution on the sample, reducing the network to the most important connections prior to computing an interpretation. We demonstrate that preprocessing with SPADE significantly increases the accuracy of image saliency maps across several interpretability methods. Additionally, SPADE improves the usefulness of neuron visualizations, aiding humans in reasoning about network behavior. Our code is available at https://github.com/IST-DASLab/SPADE."}],"arxiv":1,"title":"SPADE: Sparsity-guided debugging for deep neural networks","acknowledged_ssus":[{"_id":"ScienComp"}],"oa_version":"Preprint","day":"01","alternative_title":["PMLR"],"date_updated":"2026-07-27T12:50:03Z","date_published":"2024-09-01T00:00:00Z","article_processing_charge":"No","publication":"Proceedings of the 41st International Conference on Machine Learning","status":"public","publication_status":"published","quality_controlled":"1","page":"45955-45987","external_id":{"arxiv":["2310.04519"]},"month":"09","type":"conference","publisher":"ML Research Press","oa":1,"volume":235,"publication_identifier":{"eissn":["2640-3498"]},"conference":{"location":"Vienna, Austria","start_date":"2024-07-21","end_date":"2024-07-27","name":"ICML: International Conference on Machine Learning"},"intvolume":"       235","project":[{"name":"Vienna Graduate School on Computational Optimization","_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","grant_number":"W1260-N35"}],"citation":{"apa":"Moakhar, A. S., Iofinova, E. B., Frantar, E., &#38; Alistarh, D.-A. (2024). SPADE: Sparsity-guided debugging for deep neural networks. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 45955–45987). 
Vienna, Austria: ML Research Press.","ama":"Moakhar AS, Iofinova EB, Frantar E, Alistarh D-A. SPADE: Sparsity-guided debugging for deep neural networks. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:45955-45987.","short":"A.S. Moakhar, E.B. Iofinova, E. Frantar, D.-A. Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 45955–45987.","mla":"Moakhar, Arshia Soltani, et al. “SPADE: Sparsity-Guided Debugging for Deep Neural Networks.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 45955–87.","ista":"Moakhar AS, Iofinova EB, Frantar E, Alistarh D-A. 2024. SPADE: Sparsity-guided debugging for deep neural networks. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 45955–45987.","chicago":"Moakhar, Arshia Soltani, Eugenia B Iofinova, Elias Frantar, and Dan-Adrian Alistarh. “SPADE: Sparsity-Guided Debugging for Deep Neural Networks.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:45955–87. ML Research Press, 2024.","ieee":"A. S. Moakhar, E. B. Iofinova, E. Frantar, and D.-A. Alistarh, “SPADE: Sparsity-guided debugging for deep neural networks,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 45955–45987."}},{"ddc":["000"],"title":"Compressing large neural networks: Algorithms, systems and scaling laws","abstract":[{"lang":"eng","text":"Large language models (LLMs) have made tremendous progress in the past few years, from being able to generate coherent text to matching or surpassing humans in a wide variety of creative, knowledge or reasoning tasks. 
Much of this can be attributed to massively increased scale, both in the size of the model as well as the amount of training data, from 100s of millions to 100s of billions, or even trillions. This trend is expected to continue, which, although exciting, also raises major practical concerns. Already today's 100+ billion parameter LLMs require top-of-the-line hardware just to run. Hence, it is clear that sustaining these developments will require significant efficiency advances.\r\n\r\nHistorically, one of the most practical ways of improving model efficiency has been compression, especially in the form of sparsity or quantization. While this has been studied extensively in the past, existing accurate methods are all designed for models around 100 million parameters; scaling them up to ones literally 1000x larger is highly challenging. In this thesis, we introduce a new unified sparsification and quantization approach OBC, which through additional algorithmic enhancements leads to GPTQ and SparseGPT, the first techniques fast and accurate enough to compress 100+ billion parameter models to 4- or even 3-bit precision and 50% weight-sparsity, respectively. Additionally, we show how weight-only quantization does not just bring space savings but also up to 4.5x faster generation speed, via custom GPU kernels.\r\n\r\nIn fact, we show for the first time that it is possible to develop an FP16 times INT4 mixed-precision matrix multiplication kernel, called Marlin, which comes close to simultaneously maximizing both memory and compute utilization, making weight-only quantization highly practical even for multi-user serving. 
Further, we demonstrate that GPTQ can be scaled to widely overparametrized trillion-parameter models, where extreme sub-1-bit compression rates can be achieved without any inference slow-down, by co-designing a bespoke entropy coding scheme together with an efficient kernel.\r\n\r\nFinally, we also study compression from the perspective of someone with access to massive amounts of compute resources for training large models completely from scratch. Here the key questions revolve around the joint scaling behavior between compression, model size, and amount of training data used. Based on extensive experimental results for both vision and text models, we introduce the first scaling law which accurately captures the relationship between weight-sparsity, number of non-zero weights and data. This further allows us to characterize the optimal sparsity, which we find to increase the longer a fixed cost model is being trained.\r\n\r\nOverall, this thesis presents contributions to three different angles of large model efficiency: affordable but accurate algorithms, highly efficient systems implementations, and fundamental scaling laws for compressed training."}],"ec_funded":1,"status":"public","publication_status":"published","doi_confirm":"1","date_published":"2024-09-05T00:00:00Z","article_processing_charge":"No","alternative_title":["ISTA Thesis"],"date_updated":"2026-07-29T13:48:40Z","day":"05","oa_version":"Published Version","acknowledged_ssus":[{"_id":"ScienComp"}],"OA_place":"publisher","publication_identifier":{"issn":["2663-337X"]},"oa":1,"publisher":"Institute of Science and Technology 
Austria","month":"09","type":"dissertation","page":"129","file":[{"file_size":1615167,"access_level":"closed","date_updated":"2024-09-05T12:04:11Z","relation":"source_file","creator":"efrantar","date_created":"2024-09-05T12:04:11Z","content_type":"application/zip","file_name":"thesis-final.zip","checksum":"5d785645805a78c5b4ce7cc3df557b09","file_id":"17570"},{"file_size":2376611,"access_level":"open_access","date_updated":"2024-09-06T16:24:59Z","relation":"main_file","success":1,"creator":"efrantar","content_type":"application/pdf","date_created":"2024-09-06T16:24:59Z","checksum":"a9dd1c2d23734986924eb44ebb55fd8f","file_name":"frantar_thesis_final.pdf","file_id":"17880"}],"citation":{"ista":"Frantar E. 2024. Compressing large neural networks: Algorithms, systems and scaling laws. Institute of Science and Technology Austria.","chicago":"Frantar, Elias. “Compressing Large Neural Networks: Algorithms, Systems and Scaling Laws.” Institute of Science and Technology Austria, 2024. <a href=\"https://doi.org/10.15479/at:ista:17485\">https://doi.org/10.15479/at:ista:17485</a>.","ieee":"E. Frantar, “Compressing large neural networks: Algorithms, systems and scaling laws,” Institute of Science and Technology Austria, 2024.","short":"E. Frantar, Compressing Large Neural Networks: Algorithms, Systems and Scaling Laws, Institute of Science and Technology Austria, 2024.","mla":"Frantar, Elias. <i>Compressing Large Neural Networks: Algorithms, Systems and Scaling Laws</i>. Institute of Science and Technology Austria, 2024, doi:<a href=\"https://doi.org/10.15479/at:ista:17485\">10.15479/at:ista:17485</a>.","ama":"Frantar E. Compressing large neural networks: Algorithms, systems and scaling laws. 2024. doi:<a href=\"https://doi.org/10.15479/at:ista:17485\">10.15479/at:ista:17485</a>","apa":"Frantar, E. (2024). <i>Compressing large neural networks: Algorithms, systems and scaling laws</i>. Institute of Science and Technology Austria. 
<a href=\"https://doi.org/10.15479/at:ista:17485\">https://doi.org/10.15479/at:ista:17485</a>"},"project":[{"name":"Elastic Coordination for Scalable Machine Learning","_id":"268A44D6-B435-11E9-9278-68D0E5697425","call_identifier":"H2020","grant_number":"805223"}],"file_date_updated":"2024-09-06T16:24:59Z","corr_author":"1","language":[{"iso":"eng"}],"degree_awarded":"PhD","supervisor":[{"full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian"}],"year":"2024","related_material":{"record":[{"id":"17378","relation":"part_of_dissertation","status":"public"},{"id":"17087","relation":"part_of_dissertation","status":"public"},{"id":"14458","relation":"part_of_dissertation","status":"public"},{"id":"18062","relation":"part_of_dissertation","status":"public"},{"status":"public","id":"18061","relation":"part_of_dissertation"}]},"user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"date_created":"2024-09-02T11:01:48Z","author":[{"full_name":"Frantar, Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","last_name":"Frantar","first_name":"Elias"}],"has_accepted_license":"1","doi":"10.15479/at:ista:17485","_id":"17485"},{"oa":1,"conference":{"name":"ICLR: International Conference on Learning Representations","start_date":"2024-05-07","end_date":"2024-05-07","location":"Vienna, Austria"},"author":[{"id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","last_name":"Frantar","full_name":"Frantar, Elias","first_name":"Elias"},{"first_name":"Carlos Riquelme","last_name":"Ruiz","full_name":"Ruiz, Carlos Riquelme"},{"full_name":"Houlsby, Neil","last_name":"Houlsby","first_name":"Neil"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X"},{"full_name":"Evci, 
Utku","last_name":"Evci","first_name":"Utku"}],"quality_controlled":"1","department":[{"_id":"DaAl"}],"type":"conference","external_id":{"arxiv":["2309.08520"]},"date_created":"2024-09-13T10:31:08Z","month":"01","citation":{"apa":"Frantar, E., Ruiz, C. R., Houlsby, N., Alistarh, D.-A., &#38; Evci, U. (2024). Scaling laws for sparsely-connected foundation models. In <i>The Twelfth International Conference on Learning Representations</i>. Vienna, Austria.","ama":"Frantar E, Ruiz CR, Houlsby N, Alistarh D-A, Evci U. Scaling laws for sparsely-connected foundation models. In: <i>The Twelfth International Conference on Learning Representations</i>. ; 2024.","mla":"Frantar, Elias, et al. “Scaling Laws for Sparsely-Connected Foundation Models.” <i>The Twelfth International Conference on Learning Representations</i>, 2024.","short":"E. Frantar, C.R. Ruiz, N. Houlsby, D.-A. Alistarh, U. Evci, in:, The Twelfth International Conference on Learning Representations, 2024.","ieee":"E. Frantar, C. R. Ruiz, N. Houlsby, D.-A. Alistarh, and U. Evci, “Scaling laws for sparsely-connected foundation models,” in <i>The Twelfth International Conference on Learning Representations</i>, Vienna, Austria, 2024.","ista":"Frantar E, Ruiz CR, Houlsby N, Alistarh D-A, Evci U. 2024. Scaling laws for sparsely-connected foundation models. The Twelfth International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","chicago":"Frantar, Elias, Carlos Riquelme Ruiz, Neil Houlsby, Dan-Adrian Alistarh, and Utku Evci. 
“Scaling Laws for Sparsely-Connected Foundation Models.” In <i>The Twelfth International Conference on Learning Representations</i>, 2024."},"scopus_import":"1","_id":"18062","ddc":["000"],"title":"Scaling laws for sparsely-connected foundation models","corr_author":"1","language":[{"iso":"eng"}],"main_file_link":[{"open_access":"1","url":"https://openreview.net/forum?id=i9K2ZWkYIP"}],"arxiv":1,"abstract":[{"text":"We explore the impact of parameter sparsity on the scaling behavior of Transformers trained on massive datasets (i.e., \"foundation models\"), in both vision and language domains. In this setting, we identify the first scaling law describing the relationship between weight sparsity, number of non-zero parameters, and amount of training data, which we validate empirically across model and data scales; on ViT/JFT-4B and T5/C4. These results allow us to characterize the \"optimal sparsity\", the sparsity level which yields the best performance for a given effective model size and training budget. For a fixed number of non-zero parameters, we identify that the optimal sparsity increases with the amount of data used for training. We also extend our study to different sparsity structures (such as the hardware-friendly n:m pattern) and strategies (such as starting from a pretrained dense model). Our findings shed light on the power and limitations of weight sparsity across various parameter and computational settings, offering both theoretical understanding and practical implications for leveraging sparsity towards computational efficiency improvements. 
We provide pruning and scaling law fitting code at: github.com/google-research/jaxpruner/tree/main/jaxpruner/projects/bigsparse.","lang":"eng"}],"publication_status":"published","publication":"The Twelfth International Conference on Learning Representations","status":"public","date_updated":"2026-07-29T13:48:40Z","day":"16","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version","year":"2024","article_processing_charge":"No","related_material":{"record":[{"status":"public","id":"17485","relation":"dissertation_contains"}]},"date_published":"2024-01-16T00:00:00Z"}]