[{"month":"09","date_updated":"2024-10-01T08:22:01Z","_id":"18117","corr_author":"1","title":"RoSA: Accurate parameter-efficient fine-tuning via robust adaptation","type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","conference":{"end_date":"2024-07-27","start_date":"2024-07-21","name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria"},"volume":235,"scopus_import":"1","oa_version":"Preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2401.04679","open_access":"1"}],"day":"01","publisher":"ML Research Press","oa":1,"citation":{"ama":"Nikdan M, Tabesh S, Crncevic E, Alistarh D-A. RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:38187-38206.","short":"M. Nikdan, S. Tabesh, E. Crncevic, D.-A. Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 38187–38206.","chicago":"Nikdan, Mahdi, Soroush Tabesh, Elvir Crncevic, and Dan-Adrian Alistarh. “RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:38187–206. ML Research Press, 2024.","ista":"Nikdan M, Tabesh S, Crncevic E, Alistarh D-A. 2024. RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning vol. 235, 38187–38206.","ieee":"M. Nikdan, S. Tabesh, E. Crncevic, and D.-A. Alistarh, “RoSA: Accurate parameter-efficient fine-tuning via robust adaptation,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 38187–38206.","apa":"Nikdan, M., Tabesh, S., Crncevic, E., &#38; Alistarh, D.-A. (2024). RoSA: Accurate parameter-efficient fine-tuning via robust adaptation. 
In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 38187–38206). Vienna, Austria: ML Research Press.","mla":"Nikdan, Mahdi, et al. “RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 38187–206."},"intvolume":"       235","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"publication_identifier":{"eissn":["2640-3498"]},"language":[{"iso":"eng"}],"publication":"Proceedings of the 41st International Conference on Machine Learning","year":"2024","acknowledgement":"The authors would like to thank Eldar Kurtic for experimental support and useful suggestions throughout the project","quality_controlled":"1","author":[{"full_name":"Nikdan, Mahdi","id":"66374281-f394-11eb-9cf6-869147deecc0","first_name":"Mahdi","last_name":"Nikdan"},{"last_name":"Tabesh","first_name":"Soroush","id":"06000900-6068-11ef-8d61-c2472ef2e752","orcid":"0009-0003-4119-6281","full_name":"Tabesh, Soroush"},{"last_name":"Crncevic","full_name":"Crncevic, Elvir","id":"41888001-440d-11ef-8299-d0e838b8185e","first_name":"Elvir"},{"last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"abstract":[{"lang":"eng","text":"We investigate parameter-efficient fine-tuning (PEFT) methods that can provide good accuracy under limited computational and memory budgets in the context of large language models (LLMs). We present a new PEFT method called Robust Adaptation (RoSA) inspired by robust principal component analysis that jointly trains low-rank\r\n and highly-sparse components on top of a set of fixed pretrained weights to efficiently approximate the performance of a full-fine-tuning (FFT) solution. 
Across a series of challenging generative tasks such as grade-school math and SQL query generation, which require fine-tuning for good performance, we show that RoSA outperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at the same parameter budget, and can even recover the performance of FFT on some tasks. We provide system support for RoSA to complement the training algorithm, specifically in the form of sparse GPU kernels which enable memory- and computationally-efficient training, and show that it is also compatible with low-precision base weights, resulting in the first joint representation combining quantization, low-rank and sparse approximations. Our code is available at https://github.com/IST-DASLab/RoSA."}],"page":"38187-38206","article_processing_charge":"No","date_published":"2024-09-01T00:00:00Z","date_created":"2024-09-22T22:01:44Z","status":"public","related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/RoSA"}]},"external_id":{"arxiv":["2401.04679"]}},{"publisher":"Zenodo","article_processing_charge":"No","date_created":"2025-06-24T06:09:18Z","date_published":"2024-11-24T00:00:00Z","oa":1,"OA_place":"repository","status":"public","citation":{"mla":"Frantar, Elias, et al. <i>MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models</i>. Zenodo, 2024, doi:<a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>.","ista":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. 2024. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models, Zenodo, <a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>.","ieee":"E. Frantar, R. Castro, J. Chen, T. Hoefler, and D.-A. Alistarh, “MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models.” Zenodo, 2024.","apa":"Frantar, E., Castro, R., Chen, J., Hoefler, T., &#38; Alistarh, D.-A. (2024). 
MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. Zenodo. <a href=\"https://doi.org/10.5281/ZENODO.14213091\">https://doi.org/10.5281/ZENODO.14213091</a>","chicago":"Frantar, Elias, Roberto Castro, Jiale Chen, Torsten Hoefler, and Dan-Adrian Alistarh. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” Zenodo, 2024. <a href=\"https://doi.org/10.5281/ZENODO.14213091\">https://doi.org/10.5281/ZENODO.14213091</a>.","ama":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. 2024. doi:<a href=\"https://doi.org/10.5281/ZENODO.14213091\">10.5281/ZENODO.14213091</a>","short":"E. Frantar, R. Castro, J. Chen, T. Hoefler, D.-A. Alistarh, (2024)."},"related_material":{"record":[{"id":"19877","status":"public","relation":"used_for_analysis_in"}]},"department":[{"_id":"DaAl"}],"license":"https://creativecommons.org/licenses/by/4.0/","abstract":[{"lang":"eng","text":"This is Marlin, a Mixed Auto-Regressive Linear kernel (and the name of one of the planet's fastest fish), an extremely optimized FP16xINT4 matmul kernel aimed at LLM inference that can deliver close to ideal (4x) speedups up to batchsizes of 16-32 tokens (in contrast to the 1-2 tokens of prior work with comparable speedup).\r\n\r\nAdditionally, it includes Sparse-Marlin, an extension of the MARLIN kernels adding support to 2:4 weight sparsity, achieving 5.3x speedups on NVIDIA GPUs (Ampere/Ada)."}],"has_accepted_license":"1","day":"24","oa_version":"Published Version","main_file_link":[{"url":"https://doi.org/10.5281/ZENODO.14213091","open_access":"1"}],"doi":"10.5281/ZENODO.14213091","type":"research_data_reference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 
4.0)","short":"CC BY (4.0)"},"author":[{"full_name":"Frantar, Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias","last_name":"Frantar"},{"full_name":"Castro, Roberto","first_name":"Roberto","last_name":"Castro"},{"last_name":"Chen","orcid":"0000-0001-5337-5875","full_name":"Chen, Jiale","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","first_name":"Jiale"},{"last_name":"Hoefler","first_name":"Torsten","full_name":"Hoefler, Torsten"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","last_name":"Alistarh"}],"month":"11","date_updated":"2025-09-30T13:41:56Z","_id":"19884","year":"2024","corr_author":"1","ddc":["000"],"title":"MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models"},{"oa":1,"citation":{"apa":"Modoranu, I.-V., Kalinov, A., Kurtic, E., Frantar, E., &#38; Alistarh, D.-A. (2024). Error feedback can accurately compress preconditioners. In <i>41st International Conference on Machine Learning</i> (Vol. 235, pp. 35910–35933). Vienna, Austria: ML Research Press.","ista":"Modoranu I-V, Kalinov A, Kurtic E, Frantar E, Alistarh D-A. 2024. Error feedback can accurately compress preconditioners. 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 35910–35933.","ieee":"I.-V. Modoranu, A. Kalinov, E. Kurtic, E. Frantar, and D.-A. Alistarh, “Error feedback can accurately compress preconditioners,” in <i>41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 35910–35933.","mla":"Modoranu, Ionut-Vlad, et al. “Error Feedback Can Accurately Compress Preconditioners.” <i>41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 35910–33.","ama":"Modoranu I-V, Kalinov A, Kurtic E, Frantar E, Alistarh D-A. Error feedback can accurately compress preconditioners. 
In: <i>41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:35910-35933.","short":"I.-V. Modoranu, A. Kalinov, E. Kurtic, E. Frantar, D.-A. Alistarh, in:, 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 35910–35933.","chicago":"Modoranu, Ionut-Vlad, Aleksei Kalinov, Eldar Kurtic, Elias Frantar, and Dan-Adrian Alistarh. “Error Feedback Can Accurately Compress Preconditioners.” In <i>41st International Conference on Machine Learning</i>, 235:35910–33. ML Research Press, 2024."},"publisher":"ML Research Press","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"intvolume":"       235","volume":235,"scopus_import":"1","conference":{"end_date":"2024-07-27","start_date":"2024-07-21","name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria"},"day":"30","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2306.06098","open_access":"1"}],"oa_version":"Preprint","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","date_updated":"2025-01-30T07:54:16Z","_id":"18975","month":"07","title":"Error feedback can accurately compress preconditioners","alternative_title":["PMLR"],"corr_author":"1","status":"public","OA_place":"repository","article_processing_charge":"No","date_published":"2024-07-30T00:00:00Z","date_created":"2025-01-30T07:53:22Z","external_id":{"arxiv":["2306.06098"]},"abstract":[{"text":"Leveraging second-order information about the loss at the scale of deep networks is one of the main lines of approach for improving the performance of current optimizers for deep learning. Yet, existing approaches for accurate full-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free Approximate Curvature (M-FAC) suffer from massive storage costs when applied even to small-scale models, as they must store a sliding window of gradients, whose memory requirements are multiplicative in the model dimension. 
In this paper, we address this issue via a novel and efficient error-feedback technique that can be applied to compress preconditioners by up to two orders of magnitude in practice, without loss of convergence. Specifically, our approach compresses the gradient information via sparsification or low-rank compression before it is fed into the preconditioner, feeding the compression error back into future iterations. Extensive experiments on deep neural networks show that this approach can compress full-matrix preconditioners to up to 99% sparsity without accuracy loss, effectively removing the memory overhead of full-matrix preconditioners such as GGT and M-FAC.","lang":"eng"}],"OA_type":"green","page":"35910-35933","author":[{"last_name":"Modoranu","first_name":"Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","full_name":"Modoranu, Ionut-Vlad"},{"last_name":"Kalinov","first_name":"Aleksei","id":"44b7120e-eb97-11eb-a6c2-e1557aa81d02","full_name":"Kalinov, Aleksei","orcid":"0000-0003-2189-3904"},{"full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","first_name":"Eldar","last_name":"Kurtic"},{"last_name":"Frantar","full_name":"Frantar, Elias","first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"quality_controlled":"1","acknowledged_ssus":[{"_id":"CampIT"}],"language":[{"iso":"eng"}],"year":"2024","publication":"41st International Conference on Machine Learning","publication_identifier":{"eissn":["2640-3498"]},"acknowledgement":"The authors thank Adrian Vladu, Razvan Pascanu, Alexandra Peste, Mher Safaryan for their valuable feedback, the IT department from Institute of Science and Technology Austria for the hardware support and Weights and Biases for the infrastructure to track all our 
experiments."},{"ec_funded":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","title":"AsGrad: A sharp unified analysis of asynchronous-SGD algorithms","corr_author":"1","alternative_title":["PMLR"],"_id":"18976","date_updated":"2025-04-14T07:54:52Z","month":"05","department":[{"_id":"DaAl"}],"publication_status":"published","arxiv":1,"intvolume":"       238","citation":{"mla":"Islamov, Rustem, et al. “AsGrad: A Sharp Unified Analysis of Asynchronous-SGD Algorithms.” <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, vol. 238, ML Research Press, 2024, pp. 649–57.","ista":"Islamov R, Safaryan M, Alistarh D-A. 2024. AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. Proceedings of The 27th International Conference on Artificial Intelligence and Statistics. AISTATS: Conference on Artificial Intelligence and Statistics, PMLR, vol. 238, 649–657.","apa":"Islamov, R., Safaryan, M., &#38; Alistarh, D.-A. (2024). AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. In <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i> (Vol. 238, pp. 649–657). Valencia, Spain: ML Research Press.","ieee":"R. Islamov, M. Safaryan, and D.-A. Alistarh, “AsGrad: A sharp unified analysis of asynchronous-SGD algorithms,” in <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, Valencia, Spain, 2024, vol. 238, pp. 649–657.","chicago":"Islamov, Rustem, Mher Safaryan, and Dan-Adrian Alistarh. “AsGrad: A Sharp Unified Analysis of Asynchronous-SGD Algorithms.” In <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>, 238:649–57. ML Research Press, 2024.","ama":"Islamov R, Safaryan M, Alistarh D-A. AsGrad: A sharp unified analysis of asynchronous-SGD algorithms. In: <i>Proceedings of The 27th International Conference on Artificial Intelligence and Statistics</i>. Vol 238. 
ML Research Press; 2024:649-657.","short":"R. Islamov, M. Safaryan, D.-A. Alistarh, in:, Proceedings of The 27th International Conference on Artificial Intelligence and Statistics, ML Research Press, 2024, pp. 649–657."},"oa":1,"publisher":"ML Research Press","oa_version":"Preprint","day":"15","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2310.20452","open_access":"1"}],"volume":238,"scopus_import":"1","project":[{"grant_number":"101034413","name":"IST-BRIDGE: International postdoctoral program","call_identifier":"H2020","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c"}],"conference":{"start_date":"2024-05-02","end_date":"2024-05-04","name":"AISTATS: Conference on Artificial Intelligence and Statistics","location":"Valencia, Spain"},"author":[{"full_name":"Islamov, Rustem","first_name":"Rustem","last_name":"Islamov"},{"last_name":"Safaryan","first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","full_name":"Safaryan, Mher"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"quality_controlled":"1","acknowledgement":"The authors thank all anonymous reviewers for their valuable comments and suggestions on how to improve the manuscript. This work was done when Rustem Islamov was a Master’s student at Institut Polytechnique de Paris (IP Paris) and an intern at Institute of Science and Technology Austria (ISTA). The research of Rustem Islamov was supported by ISTA internship\r\nprogram. 
Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No 101034413.","year":"2024","language":[{"iso":"eng"}],"publication":"Proceedings of The 27th International Conference on Artificial Intelligence and Statistics","publication_identifier":{"eissn":["2640-3498"]},"external_id":{"arxiv":["2310.20452"]},"OA_place":"repository","status":"public","date_published":"2024-05-15T00:00:00Z","date_created":"2025-01-30T08:15:49Z","article_processing_charge":"No","OA_type":"green","page":"649-657","abstract":[{"lang":"eng","text":"We analyze asynchronous-type algorithms for distributed SGD in the heterogeneous setting, where each worker has its own computation and communication speeds, as well as data distribution. In these algorithms, workers compute possibly stale and stochastic gradients associated with their local data at some iteration back in history and then return those gradients to the server without synchronizing with other workers. We present a unified convergence theory for non-convex smooth functions in the heterogeneous regime. The proposed analysis provides convergence for pure asynchronous SGD and its various modifications. Moreover, our theory explains what affects the convergence rate and what can be done to improve the performance of asynchronous algorithms. In particular, we introduce a novel asynchronous method based on worker shuffling. As a by-product of our analysis, we also demonstrate convergence guarantees for gradient-type algorithms such as SGD with random reshuffling and shuffle-once mini-batch SGD. The derived rates match the best-known results for those algorithms, highlighting the tightness of our approach. 
Finally, our numerical evaluations support theoretical findings and show the good practical performance of our method."}]},{"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","title":"SpQR: A sparse-quantized representation for near-lossless LLM weight compression","month":"05","_id":"18977","date_updated":"2025-01-30T08:27:47Z","department":[{"_id":"DaAl"}],"arxiv":1,"publication_status":"published","publisher":"OpenReview","citation":{"ista":"Dettmers T, Svirschevski RA, Egiazarian V, Kuznedelev D, Frantar E, Ashkboos S, Borzunov A, Hoefler T, Alistarh D-A. 2024. SpQR: A sparse-quantized representation for near-lossless LLM weight compression. 12th International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","ieee":"T. Dettmers <i>et al.</i>, “SpQR: A sparse-quantized representation for near-lossless LLM weight compression,” in <i>12th International Conference on Learning Representations</i>, Vienna, Austria, 2024.","apa":"Dettmers, T., Svirschevski, R. A., Egiazarian, V., Kuznedelev, D., Frantar, E., Ashkboos, S., … Alistarh, D.-A. (2024). SpQR: A sparse-quantized representation for near-lossless LLM weight compression. In <i>12th International Conference on Learning Representations</i>. Vienna, Austria: OpenReview.","mla":"Dettmers, Tim, et al. “SpQR: A Sparse-Quantized Representation for near-Lossless LLM Weight Compression.” <i>12th International Conference on Learning Representations</i>, OpenReview, 2024.","ama":"Dettmers T, Svirschevski RA, Egiazarian V, et al. SpQR: A sparse-quantized representation for near-lossless LLM weight compression. In: <i>12th International Conference on Learning Representations</i>. OpenReview; 2024.","short":"T. Dettmers, R.A. Svirschevski, V. Egiazarian, D. Kuznedelev, E. Frantar, S. Ashkboos, A. Borzunov, T. Hoefler, D.-A. Alistarh, in:, 12th International Conference on Learning Representations, OpenReview, 2024.","chicago":"Dettmers, Tim, Ruslan A. 
Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, and Dan-Adrian Alistarh. “SpQR: A Sparse-Quantized Representation for near-Lossless LLM Weight Compression.” In <i>12th International Conference on Learning Representations</i>. OpenReview, 2024."},"oa":1,"day":"15","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2306.03078","open_access":"1"}],"oa_version":"Preprint","conference":{"end_date":"2024-05-11","start_date":"2024-05-07","name":"ICLR: International Conference on Learning Representations","location":"Vienna, Austria"},"scopus_import":"1","quality_controlled":"1","author":[{"first_name":"Tim","full_name":"Dettmers, Tim","last_name":"Dettmers"},{"last_name":"Svirschevski","first_name":"Ruslan A.","full_name":"Svirschevski, Ruslan A."},{"full_name":"Egiazarian, Vage","first_name":"Vage","last_name":"Egiazarian"},{"full_name":"Kuznedelev, Denis","first_name":"Denis","last_name":"Kuznedelev"},{"id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias","full_name":"Frantar, Elias","last_name":"Frantar"},{"last_name":"Ashkboos","full_name":"Ashkboos, Saleh","first_name":"Saleh"},{"first_name":"Alexander","full_name":"Borzunov, Alexander","last_name":"Borzunov"},{"first_name":"Torsten","full_name":"Hoefler, Torsten","last_name":"Hoefler"},{"last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"acknowledgement":"Denis Kuznedelev acknowledges the support from the Russian Ministry of Science and Higher\r\nEducation, grant No. 075-10-2021-068. 
Ruslan Svirschevski and Vage Egiazarian and Denis\r\nKuznedelev were supported by the grant for research centers in the field of AI provided by the\r\nAnalytical Center for the Government of the Russian Federation (ACRF) in accordance with the\r\nagreement on the provision of subsidies (identifier of the agreement 000000D730321P5Q0002) and the agreement with HSE University No. 70-2021-00139.","language":[{"iso":"eng"}],"year":"2024","publication":"12th International Conference on Learning Representations","external_id":{"arxiv":["2306.03078"]},"date_published":"2024-05-15T00:00:00Z","date_created":"2025-01-30T08:26:59Z","article_processing_charge":"No","OA_place":"repository","status":"public","OA_type":"green","abstract":[{"text":"Recent advances in large language model (LLM) pretraining have led to high-quality LLMs with impressive abilities. By compressing such LLMs via quantization to 3-4 bits per parameter, they can fit into memory-limited devices such as laptops and mobile phones, enabling personalized use. Quantizing models to 3-4 bits per parameter can lead to moderate to high accuracy losses, especially for smaller models (1-10B parameters), which are suitable for edge deployment. To address this accuracy issue, we introduce the Sparse-Quantized Representation (SpQR), a new compressed format and quantization technique that enables for the first time \\emph{near-lossless} compression of LLMs across model scales while reaching similar compression levels to previous methods. SpQR works by identifying and isolating \\emph{outlier weights}, which cause particularly large quantization errors, and storing them in higher precision while compressing all other weights to 3-4 bits, and achieves relative accuracy losses of less than 1% in perplexity for highly-accurate LLaMA and Falcon LLMs. 
This makes it possible to run a 33B parameter LLM on a single 24 GB consumer GPU without performance degradation at 15% speedup, thus making powerful LLMs available to consumers without any downsides. SpQR comes with efficient algorithms for both encoding weights into its format, as well as decoding them efficiently at runtime. Specifically, we provide an efficient GPU inference algorithm for SpQR, which yields faster inference than 16-bit baselines at similar accuracy while enabling memory compression gains of more than 4x.","lang":"eng"}]},{"abstract":[{"lang":"eng","text":"We propose a new variant of the Adam optimizer [Kingma and Ba, 2014] called\r\nMICROADAM that specifically minimizes memory overheads, while maintaining\r\ntheoretical convergence guarantees. We achieve this by compressing the gradient\r\ninformation before it is fed into the optimizer state, thereby reducing its memory\r\nfootprint significantly. We control the resulting compression error via a novel\r\ninstance of the classical error feedback mechanism from distributed optimization [Seide et al., 2014, Alistarh et al., 2018, Karimireddy et al., 2019] in which\r\nthe error correction information is itself compressed to allow for practical memory\r\ngains. We prove that the resulting approach maintains theoretical convergence\r\nguarantees competitive to those of AMSGrad, while providing good practical performance. Specifically, we show that MICROADAM can be implemented efficiently\r\non GPUs: on both million-scale (BERT) and billion-scale (LLaMA) models, MICROADAM provides practical convergence competitive to that of the uncompressed\r\nAdam baseline, with lower memory usage and similar running time. 
Our code is\r\navailable at https://github.com/IST-DASLab/MicroAdam."}],"OA_type":"green","related_material":{"link":[{"url":"https://github.com/IST-DASLab/MicroAdam","relation":"software"}]},"external_id":{"arxiv":["2405.15593"]},"article_processing_charge":"No","date_created":"2025-04-06T22:01:32Z","date_published":"2024-12-20T00:00:00Z","OA_place":"repository","status":"public","acknowledgement":"The authors thank Razvan Pascanu, Mahdi Nikdan and Soroush Tabesh for their valuable feedback, the IT department from Institute of Science and Technology Austria for the hardware support and Weights and Biases for the infrastructure to track all our experiments. Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Marie Sklodowska-Curie grant agreement No 101034413.","publication_identifier":{"issn":["1049-5258"]},"publication":"38th Conference on Neural Information Processing Systems","language":[{"iso":"eng"}],"year":"2024","acknowledged_ssus":[{"_id":"CampIT"}],"quality_controlled":"1","author":[{"last_name":"Modoranu","full_name":"Modoranu, Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","first_name":"Ionut-Vlad"},{"first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","full_name":"Safaryan, Mher","last_name":"Safaryan"},{"full_name":"Malinovsky, Grigory","first_name":"Grigory","last_name":"Malinovsky"},{"first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","full_name":"Kurtic, Eldar","last_name":"Kurtic"},{"id":"de632733-1457-11f0-ae22-b5914b8c1c41","first_name":"Thomas","full_name":"Robert, Thomas","last_name":"Robert"},{"full_name":"Richtárik, Peter","first_name":"Peter","last_name":"Richtárik"},{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, 
Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"oa_version":"Preprint","day":"20","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2405.15593","open_access":"1"}],"project":[{"grant_number":"101034413","name":"IST-BRIDGE: International postdoctoral program","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","call_identifier":"H2020"}],"volume":37,"scopus_import":"1","intvolume":"        37","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"publisher":"Neural Information Processing Systems Foundation","oa":1,"citation":{"mla":"Modoranu, Ionut-Vlad, et al. “MICROADAM: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","apa":"Modoranu, I.-V., Safaryan, M., Malinovsky, G., Kurtic, E., Robert, T., Richtárik, P., &#38; Alistarh, D.-A. (2024). MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Neural Information Processing Systems Foundation.","ieee":"I.-V. Modoranu <i>et al.</i>, “MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence,” in <i>38th Conference on Neural Information Processing Systems</i>, 2024, vol. 37.","ista":"Modoranu I-V, Safaryan M, Malinovsky G, Kurtic E, Robert T, Richtárik P, Alistarh D-A. 2024. MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. 38th Conference on Neural Information Processing Systems. , Advances in Neural Information Processing Systems, vol. 37.","chicago":"Modoranu, Ionut-Vlad, Mher Safaryan, Grigory Malinovsky, Eldar Kurtic, Thomas Robert, Peter Richtárik, and Dan-Adrian Alistarh. 
“MICROADAM: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ama":"Modoranu I-V, Safaryan M, Malinovsky G, et al. MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","short":"I.-V. Modoranu, M. Safaryan, G. Malinovsky, E. Kurtic, T. Robert, P. Richtárik, D.-A. Alistarh, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024."},"alternative_title":["Advances in Neural Information Processing Systems"],"corr_author":"1","title":"MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence","month":"12","date_updated":"2025-05-14T11:32:52Z","_id":"19510","ec_funded":1,"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87"},{"publisher":"Neural Information Processing Systems Foundation","citation":{"chicago":"Ashkboos, Saleh, Amirkeivan Mohtashami, Maximilian L. Croci, Bo Li, Pashmina Cameron, Martin Jaggi, Dan-Adrian Alistarh, Torsten Hoefler, and James Hensman. “QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ama":"Ashkboos S, Mohtashami A, Croci ML, et al. QuaRot: Outlier-free 4-bit inference in rotated LLMs. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","short":"S. Ashkboos, A. Mohtashami, M.L. Croci, B. Li, P. Cameron, M. Jaggi, D.-A. Alistarh, T. Hoefler, J. 
Hensman, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","apa":"Ashkboos, S., Mohtashami, A., Croci, M. L., Li, B., Cameron, P., Jaggi, M., … Hensman, J. (2024). QuaRot: Outlier-free 4-bit inference in rotated LLMs. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation.","ista":"Ashkboos S, Mohtashami A, Croci ML, Li B, Cameron P, Jaggi M, Alistarh D-A, Hoefler T, Hensman J. 2024. QuaRot: Outlier-free 4-bit inference in rotated LLMs. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","ieee":"S. Ashkboos <i>et al.</i>, “QuaRot: Outlier-free 4-bit inference in rotated LLMs,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","mla":"Ashkboos, Saleh, et al. “QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 
37, Neural Information Processing Systems Foundation, 2024."},"oa":1,"intvolume":"        37","department":[{"_id":"DaAl"}],"arxiv":1,"publication_status":"published","conference":{"location":"Vancouver, Canada","name":"NeurIPS: Neural Information Processing Systems","end_date":"2024-12-15","start_date":"2024-12-09"},"volume":37,"scopus_import":"1","day":"20","oa_version":"Preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2404.00456","open_access":"1"}],"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","month":"12","_id":"19511","date_updated":"2025-05-14T11:33:12Z","alternative_title":["Advances in Neural Information Processing Systems"],"title":"QuaRot: Outlier-free 4-bit inference in rotated LLMs","date_published":"2024-12-20T00:00:00Z","date_created":"2025-04-06T22:01:32Z","article_processing_charge":"No","OA_place":"repository","status":"public","related_material":{"link":[{"url":"https://github.com/spcl/QuaRot","relation":"software"}]},"external_id":{"arxiv":["2404.00456"]},"OA_type":"green","abstract":[{"text":"We introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end, including all weights, activations, and KV cache in 4 bits. QuaRot rotates LLMs in a way that removes outliers from the hidden state without changing the output, making quantization easier. This computational invariance is applied to the hidden state (residual) of the LLM, as well as to the activations of the feed-forward components, aspects of the attention mechanism, and to the KV cache. The result is a quantized model where all matrix multiplications are performed in 4 bits, without any channels identified for retention in higher precision. Our 4-bit quantized LLAMA2-70B model has losses of at most 0.47 WikiText-2 perplexity and retains 99% of the zero-shot performance. 
We also show that QuaRot can provide lossless 6 and 8 bit LLAMA-2 models without any calibration data using round-to-nearest quantization. Code is available at github.com/spcl/QuaRot.","lang":"eng"}],"quality_controlled":"1","author":[{"full_name":"Ashkboos, Saleh","first_name":"Saleh","last_name":"Ashkboos"},{"last_name":"Mohtashami","first_name":"Amirkeivan","full_name":"Mohtashami, Amirkeivan"},{"full_name":"Croci, Maximilian L.","first_name":"Maximilian L.","last_name":"Croci"},{"first_name":"Bo","full_name":"Li, Bo","last_name":"Li"},{"last_name":"Cameron","full_name":"Cameron, Pashmina","first_name":"Pashmina"},{"last_name":"Jaggi","full_name":"Jaggi, Martin","first_name":"Martin"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh"},{"last_name":"Hoefler","full_name":"Hoefler, Torsten","first_name":"Torsten"},{"first_name":"James","full_name":"Hensman, James","last_name":"Hensman"}],"publication_identifier":{"issn":["1049-5258"]},"publication":"38th Conference on Neural Information Processing Systems","language":[{"iso":"eng"}],"year":"2024"},{"date_updated":"2025-05-14T11:37:10Z","_id":"19518","month":"12","title":"The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information","alternative_title":["Advances in Neural Information Processing Systems"],"corr_author":"1","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","ec_funded":1,"scopus_import":"1","volume":37,"project":[{"grant_number":"101034413","name":"IST-BRIDGE: International postdoctoral program","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","call_identifier":"H2020"}],"conference":{"end_date":"2024-12-15","start_date":"2024-12-09","name":"NeurIPS: Neural Information Processing Systems","location":"Vancouver, 
Canada"},"day":"20","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2408.17163"}],"oa_version":"Preprint","oa":1,"citation":{"ista":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. 2024. The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","apa":"Wu, D., Modoranu, I.-V., Safaryan, M., Kuznedelev, D., &#38; Alistarh, D.-A. (2024). The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation.","ieee":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, and D.-A. Alistarh, “The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","mla":"Wu, Diyuan, et al. “The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","ama":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","short":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, D.-A. Alistarh, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","chicago":"Wu, Diyuan, Ionut-Vlad Modoranu, Mher Safaryan, Denis Kuznedelev, and Dan-Adrian Alistarh. 
“The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024."},"publisher":"Neural Information Processing Systems Foundation","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"},{"_id":"MaMo"}],"intvolume":"        37","language":[{"iso":"eng"}],"year":"2024","publication":"38th Conference on Neural Information Processing Systems","publication_identifier":{"issn":["1049-5258"]},"acknowledgement":"The authors thank the anonymous NeurIPS reviewers for their useful comments and feedback, the IT department from the Institute of Science and Technology Austria for the hardware support, and Weights and Biases for the infrastructure to track all our experiments. Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Maria Skłodowska-Curie grant agreement No 101034413.","author":[{"last_name":"Wu","id":"1a5914c2-896a-11ed-bdf8-fb80621a0635","first_name":"Diyuan","full_name":"Wu, Diyuan"},{"first_name":"Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","full_name":"Modoranu, Ionut-Vlad","last_name":"Modoranu"},{"first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","full_name":"Safaryan, Mher","last_name":"Safaryan"},{"last_name":"Kuznedelev","first_name":"Denis","full_name":"Kuznedelev, Denis"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"quality_controlled":"1","acknowledged_ssus":[{"_id":"CampIT"}],"abstract":[{"lang":"eng","text":"The rising footprint of machine learning has led to a focus on imposing model\r\nsparsity as a means of reducing computational and memory costs. 
For deep neural\r\nnetworks (DNNs), the state-of-the-art accuracy-vs-sparsity is achieved by heuristics\r\ninspired by the classical Optimal Brain Surgeon (OBS) framework [LeCun et al.,\r\n1989, Hassibi and Stork, 1992, Hassibi et al., 1993], which leverages loss curvature\r\ninformation to make better pruning decisions. Yet, these results still lack a solid\r\ntheoretical understanding, and it is unclear whether they can be improved by\r\nleveraging connections to the wealth of work on sparse recovery algorithms. In this\r\npaper, we draw new connections between these two areas and present new sparse\r\nrecovery algorithms inspired by the OBS framework that comes with theoretical\r\nguarantees under reasonable assumptions and have strong practical performance.\r\nSpecifically, our work starts from the observation that we can leverage curvature\r\ninformation in OBS-like fashion upon the projection step of classic iterative sparse\r\nrecovery algorithms such as IHT. We show for the first time that this leads both\r\nto improved convergence bounds under standard assumptions. Furthermore, we\r\npresent extensions of this approach to the practical task of obtaining accurate sparse\r\nDNNs, and validate it experimentally at scale for Transformer-based models on\r\nvision and language tasks."}],"OA_type":"green","status":"public","OA_place":"repository","article_processing_charge":"No","date_published":"2024-12-20T00:00:00Z","date_created":"2025-04-06T22:01:32Z","external_id":{"arxiv":["2408.17163"]}},{"publication_identifier":{"isbn":["9798331314385"],"issn":["1049-5258"]},"year":"2024","publication":"38th Conference on Neural Information Processing Systems","language":[{"iso":"eng"}],"acknowledgement":"Authors would like to thank Vage Egiazarian, Andrei Panferov and Ruslan Svirschevski for their\r\nhelp and advice on AQLM codebase and running large-scale experiments. 
We also thank Philip\r\nZmushko and Artem Fedorov for helpful discussions during the early stages of our research. The research of Kai Yi, Konstantin Burlachenko, and Peter Richtárik reported in this publication was supported by funding from King Abdullah University of Science and Technology (KAUST) – Center of Excellence for Generative AI, under award number 5940. We would also like to thank our NeurIPS reviewers for their helpful suggestions, we specifically highlight p3Lv’s suggestions to consider smaller codebook sizes and evaluate PV-Tuning with QuIP#, both of which produced interesting findings. Finally, we thank the open-source contributors from llama.cpp9 and the LocalLlama10 community for discussions and inspirations on practical use cases of quantized language models, and in particular, Yalda Shabanzadeh and Arthur Aardvark for their help with improving the codebase.","file_date_updated":"2025-04-07T09:17:10Z","quality_controlled":"1","author":[{"last_name":"Malinovskii","full_name":"Malinovskii, Vladimir","first_name":"Vladimir"},{"first_name":"Denis","full_name":"Mazur, Denis","last_name":"Mazur"},{"last_name":"Ilin","full_name":"Ilin, Ivan","first_name":"Ivan"},{"first_name":"Denis","full_name":"Kuznedelev, Denis","last_name":"Kuznedelev"},{"full_name":"Burlachenko, Konstantin","first_name":"Konstantin","last_name":"Burlachenko"},{"last_name":"Yi","first_name":"Kai","full_name":"Yi, Kai"},{"last_name":"Alistarh","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian"},{"full_name":"Richtarik, 
Peter","first_name":"Peter","last_name":"Richtarik"}],"file":[{"date_updated":"2025-04-07T09:17:10Z","file_size":939712,"file_id":"19521","success":1,"creator":"dernst","access_level":"open_access","content_type":"application/pdf","checksum":"54d36f947887e26d0e568b512167001a","file_name":"2024_NeurIPS_Malinovskii.pdf","date_created":"2025-04-07T09:17:10Z","relation":"main_file"}],"abstract":[{"text":"There has been significant interest in \"extreme\" compression of large language models (LLMs), i.e. to 1-2 bits per parameter, which allows such models to be executed efficiently on resource-constrained devices. Existing work focused on improved one-shot quantization techniques and weight representations; yet, purely post-training approaches are reaching diminishing returns in terms of the accuracy-vs-bit-width trade-off. State-of-the-art quantization methods such as QuIP# and AQLM include fine-tuning (part of) the compressed parameters over a limited amount of calibration data; however, such fine-tuning techniques over compressed weights often make exclusive use of straight-through estimators (STE), whose performance is not well-understood in this setting. In this work, we question the use of STE for extreme LLM compression, showing that it can be sub-optimal, and perform a systematic study of quantization-aware fine-tuning strategies for LLMs.We propose PV-Tuning - a representation-agnostic framework that generalizes and improves upon existing fine-tuning strategies, and provides convergence guarantees in restricted cases.On the practical side, when used for 1-2 bit vector quantization, PV-Tuning outperforms prior techniques for highly-performant models such as Llama and Mistral. 
Using PV-Tuning, we achieve the first Pareto-optimal quantization for Llama-2 family models at 2 bits per parameter.","lang":"eng"}],"OA_type":"gold","article_processing_charge":"No","date_published":"2024-12-20T00:00:00Z","date_created":"2025-04-06T22:01:32Z","OA_place":"publisher","status":"public","external_id":{"arxiv":["2405.14852"]},"month":"12","date_updated":"2025-05-14T10:49:20Z","_id":"19519","alternative_title":["Advances in Neural Information Processing Systems"],"title":"PV-tuning: Beyond straight-through estimation for extreme LLM compression","ddc":["000"],"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","conference":{"start_date":"2024-12-10","end_date":"2024-12-15","location":"Vancouver, Canada","name":"NeurIPS: Neural Information Processing Systems"},"has_accepted_license":"1","volume":37,"scopus_import":"1","day":"20","oa_version":"Published Version","publisher":"Neural Information Processing Systems Foundation","oa":1,"citation":{"mla":"Malinovskii, Vladimir, et al. “PV-Tuning: Beyond Straight-through Estimation for Extreme LLM Compression.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","ieee":"V. Malinovskii <i>et al.</i>, “PV-tuning: Beyond straight-through estimation for extreme LLM compression,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","ista":"Malinovskii V, Mazur D, Ilin I, Kuznedelev D, Burlachenko K, Yi K, Alistarh D-A, Richtarik P. 2024. PV-tuning: Beyond straight-through estimation for extreme LLM compression. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","apa":"Malinovskii, V., Mazur, D., Ilin, I., Kuznedelev, D., Burlachenko, K., Yi, K., … Richtarik, P. (2024). PV-tuning: Beyond straight-through estimation for extreme LLM compression. 
In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation.","chicago":"Malinovskii, Vladimir, Denis Mazur, Ivan Ilin, Denis Kuznedelev, Konstantin Burlachenko, Kai Yi, Dan-Adrian Alistarh, and Peter Richtarik. “PV-Tuning: Beyond Straight-through Estimation for Extreme LLM Compression.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","short":"V. Malinovskii, D. Mazur, I. Ilin, D. Kuznedelev, K. Burlachenko, K. Yi, D.-A. Alistarh, P. Richtarik, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","ama":"Malinovskii V, Mazur D, Ilin I, et al. PV-tuning: Beyond straight-through estimation for extreme LLM compression. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024."},"intvolume":"        37","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}]},{"day":"08","main_file_link":[{"open_access":"1","url":"https://proceedings.mlr.press/v234/kurtic24a"}],"oa_version":"Preprint","conference":{"location":"Hongkong, China","name":"CPAL: Conference on Parsimony and Learning","end_date":"2024-01-06","start_date":"2024-01-03"},"scopus_import":"1","volume":234,"intvolume":"       234","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"publisher":"ML Research Press","oa":1,"citation":{"short":"E. Kurtic, T. Hoefler, D.-A. Alistarh, in:, Proceedings of Machine Learning Research, ML Research Press, 2024, pp. 542–553.","ama":"Kurtic E, Hoefler T, Alistarh D-A. How to prune your language model: Recovering accuracy on the “Sparsity May Cry” benchmark. In: <i>Proceedings of Machine Learning Research</i>. Vol 234. ML Research Press; 2024:542-553.","chicago":"Kurtic, Eldar, Torsten Hoefler, and Dan-Adrian Alistarh. 
“How to Prune Your Language Model: Recovering Accuracy on the ‘Sparsity May Cry’ Benchmark.” In <i>Proceedings of Machine Learning Research</i>, 234:542–53. ML Research Press, 2024.","mla":"Kurtic, Eldar, et al. “How to Prune Your Language Model: Recovering Accuracy on the ‘Sparsity May Cry’ Benchmark.” <i>Proceedings of Machine Learning Research</i>, vol. 234, ML Research Press, 2024, pp. 542–53.","ieee":"E. Kurtic, T. Hoefler, and D.-A. Alistarh, “How to prune your language model: Recovering accuracy on the ‘Sparsity May Cry’ benchmark,” in <i>Proceedings of Machine Learning Research</i>, Hongkong, China, 2024, vol. 234, pp. 542–553.","apa":"Kurtic, E., Hoefler, T., &#38; Alistarh, D.-A. (2024). How to prune your language model: Recovering accuracy on the “Sparsity May Cry” benchmark. In <i>Proceedings of Machine Learning Research</i> (Vol. 234, pp. 542–553). Hongkong, China: ML Research Press.","ista":"Kurtic E, Hoefler T, Alistarh D-A. 2024. How to prune your language model: Recovering accuracy on the ‘Sparsity May Cry’ benchmark. Proceedings of Machine Learning Research. CPAL: Conference on Parsimony and Learning, PMLR, vol. 234, 542–553."},"alternative_title":["PMLR"],"corr_author":"1","title":"How to prune your language model: Recovering accuracy on the \"Sparsity May Cry\" benchmark","month":"01","date_updated":"2024-10-09T21:08:16Z","_id":"15011","type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","abstract":[{"lang":"eng","text":"Pruning large language models (LLMs) from the BERT family has emerged as a standard compression benchmark, and several pruning methods have been proposed for this task. The recent “Sparsity May Cry” (SMC) benchmark put into question the validity of all existing methods, exhibiting a more complex setup where many known pruning methods appear to fail. 
We revisit the question of accurate BERT-pruning during fine-tuning on downstream datasets, and propose a set of general guidelines for successful pruning, even on the challenging SMC benchmark. First, we perform a cost-vs-benefits analysis of pruning model components, such as the embeddings and the classification head; second, we provide a simple-yet-general way of scaling training, sparsification and learning rate schedules relative to the desired target sparsity; finally, we investigate the importance of proper parametrization for Knowledge Distillation in the context of LLMs. Our simple insights lead to state-of-the-art results, both on classic BERT-pruning benchmarks, as well as on the SMC benchmark, showing that even classic gradual magnitude pruning (GMP) can yield competitive results, with the right approach."}],"page":"542-553","external_id":{"arxiv":["2312.13547"]},"article_processing_charge":"No","date_created":"2024-02-18T23:01:03Z","date_published":"2024-01-08T00:00:00Z","status":"public","publication_identifier":{"eissn":["2640-3498"]},"language":[{"iso":"eng"}],"year":"2024","publication":"Proceedings of Machine Learning Research","quality_controlled":"1","author":[{"last_name":"Kurtic","full_name":"Kurtic, Eldar","first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218"},{"last_name":"Hoefler","first_name":"Torsten","full_name":"Hoefler, Torsten"},{"last_name":"Alistarh","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X"}]},{"conference":{"start_date":"2024-05-02","end_date":"2024-05-04","location":"Valencia, Spain","name":"AISTATS: Conference on Artificial Intelligence and Statistics"},"scopus_import":"1","volume":238,"day":"01","oa_version":"Preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2206.10032","open_access":"1"}],"publisher":"ML Research Press","citation":{"chicago":"Zakerinia, Hossein, Shayan Talaei, Giorgi Nadiradze, and 
Dan-Adrian Alistarh. “Communication-Efficient Federated Learning with Data and Client Heterogeneity.” In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, 238:3448–56. ML Research Press, 2024.","ama":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. Communication-efficient federated learning with data and client heterogeneity. In: <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>. Vol 238. ML Research Press; 2024:3448-3456.","short":"H. Zakerinia, S. Talaei, G. Nadiradze, D.-A. Alistarh, in:, Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, ML Research Press, 2024, pp. 3448–3456.","mla":"Zakerinia, Hossein, et al. “Communication-Efficient Federated Learning with Data and Client Heterogeneity.” <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, vol. 238, ML Research Press, 2024, pp. 3448–56.","apa":"Zakerinia, H., Talaei, S., Nadiradze, G., &#38; Alistarh, D.-A. (2024). Communication-efficient federated learning with data and client heterogeneity. In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i> (Vol. 238, pp. 3448–3456). Valencia, Spain: ML Research Press.","ista":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. 2024. Communication-efficient federated learning with data and client heterogeneity. Proceedings of the 27th International Conference on Artificial Intelligence and Statistics. AISTATS: Conference on Artificial Intelligence and Statistics, PMLR, vol. 238, 3448–3456.","ieee":"H. Zakerinia, S. Talaei, G. Nadiradze, and D.-A. Alistarh, “Communication-efficient federated learning with data and client heterogeneity,” in <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, Valencia, Spain, 2024, vol. 238, pp. 
3448–3456."},"oa":1,"intvolume":"       238","department":[{"_id":"DaAl"},{"_id":"ChLa"}],"arxiv":1,"publication_status":"published","month":"05","_id":"17093","date_updated":"2024-10-09T21:08:57Z","corr_author":"1","alternative_title":["PMLR"],"title":"Communication-efficient federated learning with data and client heterogeneity","type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","page":"3448-3456","abstract":[{"text":"Federated Learning (FL) enables large-scale distributed training of machine learning models, while still allowing individual nodes to maintain data locally. However, executing FL at scale comes with inherent practical challenges: 1) heterogeneity of the local node data distributions, 2) heterogeneity of node computational speeds (asynchrony), but also 3) constraints in the amount of communication between the clients and the server. In this work, we present the first variant of the classic federated averaging (FedAvg) algorithm which, at the same time, supports data heterogeneity, partial client asynchrony, and communication compression. Our algorithm comes with a novel, rigorous analysis showing that, in spite of these system relaxations, it can provide similar convergence to FedAvg in interesting parameter regimes. 
Experimental results in the rigorous LEAF benchmark on setups of up to 300 nodes show that our algorithm ensures fast convergence for standard federated tasks, improving upon prior quantized and asynchronous approaches.","lang":"eng"}],"date_published":"2024-05-01T00:00:00Z","date_created":"2024-06-02T22:00:57Z","article_processing_charge":"No","status":"public","external_id":{"arxiv":["2206.10032"]},"publication_identifier":{"eissn":["2640-3498"]},"language":[{"iso":"eng"}],"year":"2024","publication":"Proceedings of the 27th International Conference on Artificial Intelligence and Statistics","quality_controlled":"1","author":[{"first_name":"Hossein","id":"653bd8b6-f394-11eb-9cf6-c0bbf6cd78d4","full_name":"Zakerinia, Hossein","last_name":"Zakerinia"},{"last_name":"Talaei","first_name":"Shayan","full_name":"Talaei, Shayan"},{"id":"3279A00C-F248-11E8-B48F-1D18A9856A87","first_name":"Giorgi","orcid":"0000-0001-5634-0731","full_name":"Nadiradze, Giorgi","last_name":"Nadiradze"},{"last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian"}]},{"date_updated":"2025-04-14T07:52:47Z","_id":"17329","month":"06","title":"Game dynamics and equilibrium computation in the population protocol model","ddc":["000"],"corr_author":"1","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"type":"conference","ec_funded":1,"has_accepted_license":"1","scopus_import":"1","conference":{"location":"Nantes, France","name":"PODC: Symposium on Principles of Distributed Computing","end_date":"2024-06-21","start_date":"2024-06-17"},"project":[{"_id":"0599E47C-7A3F-11EA-A408-12923DDC885E","call_identifier":"H2020","name":"Formal Methods for Stochastic Models: Algorithms and 
Applications","grant_number":"863818"}],"day":"17","oa_version":"Published Version","doi":"10.1145/3662158.3662768","oa":1,"citation":{"mla":"Alistarh, Dan-Adrian, et al. “Game Dynamics and Equilibrium Computation in the Population Protocol Model.” <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, Association for Computing Machinery, 2024, pp. 40–49, doi:<a href=\"https://doi.org/10.1145/3662158.3662768\">10.1145/3662158.3662768</a>.","ieee":"D.-A. Alistarh, K. Chatterjee, M. Karrabi, and J. M. Lazarsfeld, “Game dynamics and equilibrium computation in the population protocol model,” in <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, Nantes, France, 2024, pp. 40–49.","apa":"Alistarh, D.-A., Chatterjee, K., Karrabi, M., &#38; Lazarsfeld, J. M. (2024). Game dynamics and equilibrium computation in the population protocol model. In <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i> (pp. 40–49). Nantes, France: Association for Computing Machinery. <a href=\"https://doi.org/10.1145/3662158.3662768\">https://doi.org/10.1145/3662158.3662768</a>","ista":"Alistarh D-A, Chatterjee K, Karrabi M, Lazarsfeld JM. 2024. Game dynamics and equilibrium computation in the population protocol model. Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing. PODC: Symposium on Principles of Distributed Computing, 40–49.","short":"D.-A. Alistarh, K. Chatterjee, M. Karrabi, J.M. Lazarsfeld, in:, Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing, Association for Computing Machinery, 2024, pp. 40–49.","ama":"Alistarh D-A, Chatterjee K, Karrabi M, Lazarsfeld JM. Game dynamics and equilibrium computation in the population protocol model. In: <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>. Association for Computing Machinery; 2024:40-49. 
doi:<a href=\"https://doi.org/10.1145/3662158.3662768\">10.1145/3662158.3662768</a>","chicago":"Alistarh, Dan-Adrian, Krishnendu Chatterjee, Mehrdad Karrabi, and John M Lazarsfeld. “Game Dynamics and Equilibrium Computation in the Population Protocol Model.” In <i>Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing</i>, 40–49. Association for Computing Machinery, 2024. <a href=\"https://doi.org/10.1145/3662158.3662768\">https://doi.org/10.1145/3662158.3662768</a>."},"publisher":"Association for Computing Machinery","publication_status":"published","department":[{"_id":"DaAl"},{"_id":"KrCh"}],"year":"2024","publication":"Proceedings of the 43rd Annual ACM Symposium on Principles of Distributed Computing","language":[{"iso":"eng"}],"publication_identifier":{"isbn":["9798400706684"]},"acknowledgement":"This work was supported in part by the ERC-2020-CoG 863818 (FoRM-SMArt) grant. We thank James Aspnes and Thomas Sauerwald for several helpful discussions on Ehrenfest random walks.","file_date_updated":"2024-07-29T07:37:31Z","author":[{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh"},{"orcid":"0000-0002-4561-241X","full_name":"Chatterjee, Krishnendu","id":"2E5DCA20-F248-11E8-B48F-1D18A9856A87","first_name":"Krishnendu","last_name":"Chatterjee"},{"full_name":"Karrabi, Mehrdad","first_name":"Mehrdad","id":"67638922-f394-11eb-9cf6-f20423e08757","last_name":"Karrabi"},{"first_name":"John M","id":"17ce3656-183e-11ef-84c3-8932383e1b23","full_name":"Lazarsfeld, John M","last_name":"Lazarsfeld"}],"quality_controlled":"1","abstract":[{"text":"We initiate the study of game dynamics in the population protocol model: n agents each maintain a current local strategy and interact in pairs uniformly at random. 
Upon each interaction, the agents play a two-person game and receive a payoff from an underlying utility function, and they can subsequently update their strategies according to a fixed local algorithm. In this setting, we ask how the distribution over agent strategies evolves over a sequence of interactions, and we introduce a new distributional equilibrium concept to quantify the quality of such distributions. As an initial example, we study a class of repeated prisoner's dilemma games, and we consider a family of simple local update algorithms that yield non-trivial dynamics over the distribution of agent strategies. We show that these dynamics are related to a new class of high-dimensional Ehrenfest random walks, and we derive exact characterizations of their stationary distributions, bounds on their mixing times, and prove their convergence to approximate distributional equilibria. Our results highlight trade-offs between the local state space of each agent, and the convergence rate and approximation factor of the underlying dynamics. 
Our approach opens the door towards the further characterization of equilibrium computation for other classes of games and dynamics in the population setting.","lang":"eng"}],"page":"40-49","file":[{"file_id":"17335","file_size":750908,"date_updated":"2024-07-29T07:37:31Z","content_type":"application/pdf","access_level":"open_access","creator":"dernst","success":1,"date_created":"2024-07-29T07:37:31Z","checksum":"65a40437f83373fa79dd999d5287509e","file_name":"2024_ACMPODC_Alistarh.pdf","relation":"main_file"}],"status":"public","article_processing_charge":"Yes (via OA deal)","date_created":"2024-07-28T22:01:10Z","date_published":"2024-06-17T00:00:00Z"},{"author":[{"last_name":"Kokorin","first_name":"Ilya","full_name":"Kokorin, Ilya"},{"last_name":"Yudov","full_name":"Yudov, Victor","first_name":"Victor"},{"last_name":"Aksenov","first_name":"Vitaly","full_name":"Aksenov, Vitaly"},{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"quality_controlled":"1","acknowledgement":"We thank Trevor Brown and Yuanhao Wei for the discussion and anonymous reviewers for helping us to improve the paper. Also, we thank JetBrains and Huawei for their support.","isi":1,"language":[{"iso":"eng"}],"year":"2024","publication":"2024 IEEE International Parallel and Distributed Processing Symposium","publication_identifier":{"eissn":["1530-2075"],"isbn":["9798350337662"]},"external_id":{"isi":["001270389600078"],"arxiv":["2310.05293"]},"status":"public","article_processing_charge":"No","date_published":"2024-07-08T00:00:00Z","date_created":"2024-07-28T22:01:11Z","abstract":[{"text":"Tree data structures, such as red-black trees, quad trees, treaps, or tries, are fundamental tools in computer science. A classical problem in concurrency is to obtain expressive, efficient, and scalable versions of practical tree data structures. 
We are interested in concurrent trees supporting range queries, i.e., queries that involve multiple consecutive data items. Existing implementations with this capability can list keys in a specific range, but do not support aggregate range queries: for instance, if we want to calculate the number of keys in a range, the only choice is to retrieve a whole list and return its size. This is suboptimal: in the sequential setting, one can augment a balanced search tree with counters and, consequently, perform these aggregate requests in logarithmic rather than linear time.In this paper, we propose a generic approach to implement a broad class of range queries on concurrent trees in a way that is wait-free, asymptotically efficient, and practically scalable. The key idea is a new mechanism for maintaining metadata concurrently at tree nodes, which can be seen as a wait-free variant of hand-over-hand locking (which we call hand-over-hand helping). We did a preliminary implementation of the wait-free binary search tree and preliminary experiments have indicated the soundness of our approach.","lang":"eng"}],"page":"169-179","user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","type":"conference","title":"Wait-free trees with asymptotically-efficient range queries","corr_author":"1","date_updated":"2025-09-08T08:29:45Z","_id":"17332","month":"07","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"oa":1,"citation":{"chicago":"Kokorin, Ilya, Victor Yudov, Vitaly Aksenov, and Dan-Adrian Alistarh. “Wait-Free Trees with Asymptotically-Efficient Range Queries.” In <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, 169–79. IEEE, 2024. <a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">https://doi.org/10.1109/IPDPS57955.2024.00023</a>.","short":"I. Kokorin, V. Yudov, V. Aksenov, D.-A. Alistarh, in:, 2024 IEEE International Parallel and Distributed Processing Symposium, IEEE, 2024, pp. 
169–179.","ama":"Kokorin I, Yudov V, Aksenov V, Alistarh D-A. Wait-free trees with asymptotically-efficient range queries. In: <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>. IEEE; 2024:169-179. doi:<a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">10.1109/IPDPS57955.2024.00023</a>","mla":"Kokorin, Ilya, et al. “Wait-Free Trees with Asymptotically-Efficient Range Queries.” <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, IEEE, 2024, pp. 169–79, doi:<a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">10.1109/IPDPS57955.2024.00023</a>.","ista":"Kokorin I, Yudov V, Aksenov V, Alistarh D-A. 2024. Wait-free trees with asymptotically-efficient range queries. 2024 IEEE International Parallel and Distributed Processing Symposium. IPDPS: International Parallel and Distributed Processing Symposium, 169–179.","ieee":"I. Kokorin, V. Yudov, V. Aksenov, and D.-A. Alistarh, “Wait-free trees with asymptotically-efficient range queries,” in <i>2024 IEEE International Parallel and Distributed Processing Symposium</i>, San Francisco, CA, United States, 2024, pp. 169–179.","apa":"Kokorin, I., Yudov, V., Aksenov, V., &#38; Alistarh, D.-A. (2024). Wait-free trees with asymptotically-efficient range queries. In <i>2024 IEEE International Parallel and Distributed Processing Symposium</i> (pp. 169–179). San Francisco, CA, United States: IEEE. 
<a href=\"https://doi.org/10.1109/IPDPS57955.2024.00023\">https://doi.org/10.1109/IPDPS57955.2024.00023</a>"},"publisher":"IEEE","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2310.05293"}],"oa_version":"Preprint","day":"08","doi":"10.1109/IPDPS57955.2024.00023","scopus_import":"1","conference":{"location":"San Francisco, CA, United States","name":"IPDPS: International Parallel and Distributed Processing Symposium","start_date":"2024-05-27","end_date":"2024-05-31"}},{"corr_author":"1","alternative_title":["ISTA Thesis"],"ddc":["000"],"title":"Compressing large neural networks : Algorithms, systems and scaling laws","month":"09","_id":"17485","date_updated":"2026-04-07T12:43:04Z","ec_funded":1,"type":"dissertation","user_id":"ba8df636-2132-11f1-aed0-ed93e2281fdd","doi":"10.15479/at:ista:17485","day":"05","oa_version":"Published Version","project":[{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","call_identifier":"H2020","name":"Elastic Coordination for Scalable Machine Learning","grant_number":"805223"}],"has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"publication_status":"published","publisher":"Institute of Science and Technology Austria","citation":{"ieee":"E. Frantar, “Compressing large neural networks : Algorithms, systems and scaling laws,” Institute of Science and Technology Austria, 2024.","ista":"Frantar E. 2024. Compressing large neural networks : Algorithms, systems and scaling laws. Institute of Science and Technology Austria.","apa":"Frantar, E. (2024). <i>Compressing large neural networks : Algorithms, systems and scaling laws</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:17485\">https://doi.org/10.15479/at:ista:17485</a>","mla":"Frantar, Elias. <i>Compressing Large Neural Networks : Algorithms, Systems and Scaling Laws</i>. 
Institute of Science and Technology Austria, 2024, doi:<a href=\"https://doi.org/10.15479/at:ista:17485\">10.15479/at:ista:17485</a>.","ama":"Frantar E. Compressing large neural networks : Algorithms, systems and scaling laws. 2024. doi:<a href=\"https://doi.org/10.15479/at:ista:17485\">10.15479/at:ista:17485</a>","short":"E. Frantar, Compressing Large Neural Networks : Algorithms, Systems and Scaling Laws, Institute of Science and Technology Austria, 2024.","chicago":"Frantar, Elias. “Compressing Large Neural Networks : Algorithms, Systems and Scaling Laws.” Institute of Science and Technology Austria, 2024. <a href=\"https://doi.org/10.15479/at:ista:17485\">https://doi.org/10.15479/at:ista:17485</a>."},"oa":1,"file_date_updated":"2024-09-06T16:24:59Z","publication_identifier":{"issn":["2663-337X"]},"supervisor":[{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","last_name":"Alistarh"}],"year":"2024","language":[{"iso":"eng"}],"acknowledged_ssus":[{"_id":"ScienComp"}],"author":[{"full_name":"Frantar, Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias","last_name":"Frantar"}],"degree_awarded":"PhD","file":[{"date_updated":"2024-09-05T12:04:11Z","file_size":1615167,"file_id":"17570","creator":"efrantar","access_level":"closed","content_type":"application/zip","checksum":"5d785645805a78c5b4ce7cc3df557b09","file_name":"thesis-final.zip","date_created":"2024-09-05T12:04:11Z","relation":"source_file"},{"file_id":"17880","date_updated":"2024-09-06T16:24:59Z","file_size":2376611,"content_type":"application/pdf","success":1,"creator":"efrantar","access_level":"open_access","date_created":"2024-09-06T16:24:59Z","checksum":"a9dd1c2d23734986924eb44ebb55fd8f","file_name":"frantar_thesis_final.pdf","relation":"main_file"}],"page":"129","abstract":[{"lang":"eng","text":"Large language models (LLMs) have made tremendous progress in the past few years, from being able to 
generate coherent text to matching or surpassing humans in a wide variety of creative, knowledge or reasoning tasks. Much of this can be attributed to massively increased scale, both in the size of the model as well as the amount of training data, from 100s of millions to 100s of billions, or even trillions. This trend is expected to continue, which, although exciting, also raises major practical concerns. Already today's 100+ billion parameter LLMs require top-of-the-line hardware just to run. Hence, it is clear that sustaining these developments will require significant efficiency advances.\r\n\r\nHistorically, one of the most practical ways of improving model efficiency has been compression, especially in the form of sparsity or quantization. While this has been studied extensively in the past, existing accurate methods are all designed for models around 100 million parameters; scaling them up to ones literally 1000x larger is highly challenging. In this thesis, we introduce a new unified sparsification and quantization approach OBC, which through additional algorithmic enhancements leads to GPTQ and SparseGPT, the first techniques fast and accurate enough to compress 100+ billion parameter models to 4- or even 3-bit precision and 50% weight-sparsity, respectively. Additionally, we show how weight-only quantizion does not just bring space savings but also up to 4.5x faster generation speed, via custom GPU kernels.\r\n\r\nIn fact, we show for the first time that it is possible to develop an FP16 times INT4 mixed-precision matrix multiplication kernel, called Marlin, which comes close to simultaneously maximizing both memory and compute utilization, making weight-only quantization highly practical even for multi-user serving. 
Further, we demonstrate that GPTQ can be scaled to widely overparametrized trillion-parameter models, where extreme sub-1-bit compression rates can be achieved without any inference slow-down, by co-designing a bespoke entropy coding scheme together with an efficient kernel.\r\n\r\nFinally, we also study compression from the perspective of someone with access to massive amounts of compute resources for training large models completely from scratch. Here the key questions evolve around the joint scaling behavior between compression, model size, and amount of training data used. Based on extensive experimental results for both vision and text models, we introduce the first scaling law which accurately captures the relationship between weight-sparsity, number of non-zero weights and data. This further allows us to characterize the optimal sparsity, which we find to increase the longer a fixed cost model is being trained.\r\n\r\nOverall, this thesis presents contributions to three different angles of large model efficiency: affordable but accurate algorithms, highly efficient systems implementations, and fundamental scaling laws for compressed training."}],"related_material":{"record":[{"status":"public","id":"18062","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","id":"18061","status":"public"},{"status":"public","id":"17378","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","status":"public","id":"17087"},{"status":"public","id":"14458","relation":"part_of_dissertation"}]},"date_created":"2024-09-02T11:01:48Z","date_published":"2024-09-05T00:00:00Z","article_processing_charge":"No","status":"public","OA_place":"publisher"},{"publication":" Proceedings of Machine Learning and Systems","_id":"18061","year":"2024","language":[{"iso":"eng"}],"date_updated":"2026-04-07T12:43:03Z","month":"05","title":"QMoE: Sub-1-bit compression of trillion parameter models","corr_author":"1","author":[{"last_name":"Frantar","full_name":"Frantar, 
Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias"},{"last_name":"Alistarh","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian"}],"user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","quality_controlled":"1","type":"conference","volume":6,"abstract":[{"lang":"eng","text":"Mixture-of-Experts (MoE) architectures offer a general solution to the high inference costs of large language models (LLMs) via sparse routing, bringing faster and more accurate models, at the cost of massive parameter counts. For example, the SwitchTransformer-c2048 model has 1.6 trillion parameters, requiring 3.2TB of accelerator memory to run efficiently, which makes practical deployment challenging and expensive. In this paper, we present a solution to this memory problem, in form of a new compression and execution framework called QMoE. Specifically, QMoE consists of a scalable algorithm which accurately compresses trillion-parameter MoEs to less than 1 bit per parameter, in a custom format co-designed with bespoke GPU decoding kernels to facilitate efficient end-to-end compressed inference, with minor runtime overheads relative to uncompressed execution. Concretely, QMoE can compress the 1.6 trillion parameter SwitchTransformer-c2048 model to less than 160GB (20x compression, 0.8 bits per parameter) at only minor accuracy loss, in less than a day on a single GPU. This enables, for the first time, the execution of a trillion-parameter model on affordable commodity hardware, like a single server with 4x NVIDIA A6000 or 8x NVIDIA 3090 GPUs, at less than 5% runtime overhead relative to ideal uncompressed inference. 
The anonymized code is available at: github.com/mlsys24-qmoe/qmoe."}],"conference":{"name":"MLSys: Machine Learning and Systems","location":"Santa Clara, CA, USA","start_date":"2024-05-13","end_date":"2024-05-16"},"day":"01","oa_version":"Published Version","main_file_link":[{"url":"https://proceedings.mlsys.org/paper_files/paper/2024/hash/c74b624843218d9b6713fcf299d6d5e4-Abstract-Conference.html","open_access":"1"}],"status":"public","citation":{"chicago":"Frantar, Elias, and Dan-Adrian Alistarh. “QMoE: Sub-1-Bit Compression of Trillion Parameter Models.” In <i> Proceedings of Machine Learning and Systems</i>, edited by P. Gibbons, G. Pekhimenko, and C. De Sa, Vol. 6, 2024.","ama":"Frantar E, Alistarh D-A. QMoE: Sub-1-bit compression of trillion parameter models. In: Gibbons P, Pekhimenko G, De Sa C, eds. <i> Proceedings of Machine Learning and Systems</i>. Vol 6. ; 2024.","short":"E. Frantar, D.-A. Alistarh, in:, P. Gibbons, G. Pekhimenko, C. De Sa (Eds.),  Proceedings of Machine Learning and Systems, 2024.","mla":"Frantar, Elias, and Dan-Adrian Alistarh. “QMoE: Sub-1-Bit Compression of Trillion Parameter Models.” <i> Proceedings of Machine Learning and Systems</i>, edited by P. Gibbons et al., vol. 6, 2024.","ieee":"E. Frantar and D.-A. Alistarh, “QMoE: Sub-1-bit compression of trillion parameter models,” in <i> Proceedings of Machine Learning and Systems</i>, Santa Clara, CA, USA, 2024, vol. 6.","apa":"Frantar, E., &#38; Alistarh, D.-A. (2024). QMoE: Sub-1-bit compression of trillion parameter models. In P. Gibbons, G. Pekhimenko, &#38; C. De Sa (Eds.), <i> Proceedings of Machine Learning and Systems</i> (Vol. 6). Santa Clara, CA, USA.","ista":"Frantar E, Alistarh D-A. 2024. QMoE: Sub-1-bit compression of trillion parameter models.  Proceedings of Machine Learning and Systems. MLSys: Machine Learning and Systems vol. 
6."},"oa":1,"date_created":"2024-09-13T10:01:38Z","date_published":"2024-05-01T00:00:00Z","article_processing_charge":"No","department":[{"_id":"DaAl"}],"publication_status":"published","intvolume":"         6","editor":[{"first_name":"P.","full_name":"Gibbons, P.","last_name":"Gibbons"},{"full_name":"Pekhimenko, G.","first_name":"G.","last_name":"Pekhimenko"},{"last_name":"De Sa","full_name":"De Sa, C.","first_name":"C."}],"related_material":{"record":[{"id":"17485","status":"public","relation":"dissertation_contains"}]}},{"quality_controlled":"1","type":"conference","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","author":[{"id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias","full_name":"Frantar, Elias","last_name":"Frantar"},{"first_name":"Carlos Riquelme","full_name":"Ruiz, Carlos Riquelme","last_name":"Ruiz"},{"full_name":"Houlsby, Neil","first_name":"Neil","last_name":"Houlsby"},{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"},{"last_name":"Evci","full_name":"Evci, Utku","first_name":"Utku"}],"corr_author":"1","title":"Scaling laws for sparsely-connected foundation models","month":"01","date_updated":"2026-04-07T12:43:03Z","language":[{"iso":"eng"}],"_id":"18062","year":"2024","publication":"The Twelfth International Conference on Learning Representations","related_material":{"record":[{"relation":"dissertation_contains","id":"17485","status":"public"}]},"external_id":{"arxiv":["2309.08520"]},"arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"article_processing_charge":"No","date_created":"2024-09-13T10:31:08Z","date_published":"2024-01-16T00:00:00Z","oa":1,"citation":{"ama":"Frantar E, Ruiz CR, Houlsby N, Alistarh D-A, Evci U. Scaling laws for sparsely-connected foundation models. In: <i>The Twelfth International Conference on Learning Representations</i>. ; 2024.","short":"E. Frantar, C.R. Ruiz, N. 
Houlsby, D.-A. Alistarh, U. Evci, in:, The Twelfth International Conference on Learning Representations, 2024.","chicago":"Frantar, Elias, Carlos Riquelme Ruiz, Neil Houlsby, Dan-Adrian Alistarh, and Utku Evci. “Scaling Laws for Sparsely-Connected Foundation Models.” In <i>The Twelfth International Conference on Learning Representations</i>, 2024.","mla":"Frantar, Elias, et al. “Scaling Laws for Sparsely-Connected Foundation Models.” <i>The Twelfth International Conference on Learning Representations</i>, 2024.","apa":"Frantar, E., Ruiz, C. R., Houlsby, N., Alistarh, D.-A., &#38; Evci, U. (2024). Scaling laws for sparsely-connected foundation models. In <i>The Twelfth International Conference on Learning Representations</i>. Vienna, Austria.","ieee":"E. Frantar, C. R. Ruiz, N. Houlsby, D.-A. Alistarh, and U. Evci, “Scaling laws for sparsely-connected foundation models,” in <i>The Twelfth International Conference on Learning Representations</i>, Vienna, Austria, 2024.","ista":"Frantar E, Ruiz CR, Houlsby N, Alistarh D-A, Evci U. 2024. Scaling laws for sparsely-connected foundation models. The Twelfth International Conference on Learning Representations. ICLR: International Conference on Learning Representations."},"status":"public","main_file_link":[{"url":"https://openreview.net/forum?id=i9K2ZWkYIP","open_access":"1"}],"oa_version":"Published Version","day":"16","conference":{"end_date":"2024-05-07","start_date":"2024-05-07","name":"ICLR: International Conference on Learning Representations","location":"Vienna, Austria"},"abstract":[{"lang":"eng","text":"We explore the impact of parameter sparsity on the scaling behavior of Transformers trained on massive datasets (i.e., \"foundation models\"), in both vision and language domains. 
In this setting, we identify the first scaling law describing the relationship between weight sparsity, number of non-zero parameters, and amount of training data, which we validate empirically across model and data scales; on ViT/JFT-4B and T5/C4. These results allow us to characterize the \"optimal sparsity\", the sparsity level which yields the best performance for a given effective model size and training budget. For a fixed number of non-zero parameters, we identify that the optimal sparsity increases with the amount of data used for training. We also extend our study to different sparsity structures (such as the hardware-friendly n:m pattern) and strategies (such as starting from a pretrained dense model). Our findings shed light on the power and limitations of weight sparsity across various parameter and computational settings, offering both theoretical understanding and practical implications for leveraging sparsity towards computational efficiency improvements. We provide pruning and scaling law fitting code at: github.com/google-research/jaxpruner/tree/main/jaxpruner/projects/bigsparse."}],"scopus_import":"1"},{"oa_version":"Published Version","day":"04","doi":"10.15479/at:ista:17490","project":[{"call_identifier":"H2020","_id":"268A44D6-B435-11E9-9278-68D0E5697425","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning"}],"has_accepted_license":"1","publication_status":"published","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"publisher":"Institute of Science and Technology Austria","oa":1,"citation":{"ista":"Markov I. 2024. Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective. Institute of Science and Technology Austria.","ieee":"I. Markov, “Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective,” Institute of Science and Technology Austria, 2024.","apa":"Markov, I. (2024). 
<i>Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:17490\">https://doi.org/10.15479/at:ista:17490</a>","mla":"Markov, Ilia. <i>Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective</i>. Institute of Science and Technology Austria, 2024, doi:<a href=\"https://doi.org/10.15479/at:ista:17490\">10.15479/at:ista:17490</a>.","ama":"Markov I. Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective. 2024. doi:<a href=\"https://doi.org/10.15479/at:ista:17490\">10.15479/at:ista:17490</a>","short":"I. Markov, Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective, Institute of Science and Technology Austria, 2024.","chicago":"Markov, Ilia. “Communication-Efficient Distributed Training of Deep Neural Networks : An Algorithms and Systems Perspective.” Institute of Science and Technology Austria, 2024. 
<a href=\"https://doi.org/10.15479/at:ista:17490\">https://doi.org/10.15479/at:ista:17490</a>."},"alternative_title":["ISTA Thesis"],"corr_author":"1","ddc":["000"],"title":"Communication-efficient distributed training of deep neural networks : An algorithms and systems perspective","month":"09","date_updated":"2026-04-07T13:00:54Z","_id":"17490","ec_funded":1,"type":"dissertation","user_id":"ba8df636-2132-11f1-aed0-ed93e2281fdd","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode","short":"CC BY-NC-SA (4.0)","name":"Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)","image":"/images/cc_by_nc_sa.png"},"degree_awarded":"PhD","file":[{"file_size":43327753,"date_updated":"2024-09-04T08:35:35Z","file_id":"17491","access_level":"closed","creator":"imarkov","content_type":"application/x-zip-compressed","checksum":"77609f4835d2730e46fa0d42d9134ed9","file_name":"Thesis.zip","date_created":"2024-09-04T08:35:35Z","relation":"source_file"},{"checksum":"9e68f7217570f756ceb8f70b980938cd","file_name":"Thesis_final_version_pdfa2.pdf","date_created":"2024-09-04T08:36:06Z","relation":"main_file","file_size":2756082,"date_updated":"2024-09-04T08:36:06Z","file_id":"17492","creator":"imarkov","access_level":"open_access","success":1,"content_type":"application/pdf"}],"abstract":[{"lang":"eng","text":"Deep learning is essential in numerous applications nowadays, with many recent advancements made possible by training very large models. Despite their broad applicability, training neural networks is often time-intensive, and it is usually impractical to manage large models and datasets on a single machine. To address these issues, distributed deep learning training has become increasingly important. However, distributed training requires synchronization among nodes, and the mini-batch stochastic gradient descent algorithm places a significant load on network connections. 
A possible solution to tackle the synchronization bottleneck is to reduce a message size by lossy compression.\r\n\r\nIn this thesis, we investigate systems and algorithmic approaches to communication compression during training. From the systems perspective, we demonstrate that a common approach of expensive hardware overprovisioning can be replaced through a thorough system design. We introduce a framework that introduces efficient software support for compressed communication in machine learning applications, applicable to both multi-GPU single-node training and larger-scale multi-node training. Our framework integrates with popular ML frameworks, providing up to 3x speedups for multi-GPU nodes based on commodity hardware and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.\r\n\r\nAlso, we consider an application of our framework to different communication schemes, such as Fully Sharded Data Parallel. We provide strong convergence guarantees for the compression in such a setup. Empirical validation shows that our method preserves model accuracy for GPT-family models with up to 1.3 billion parameters, while completely removing the communication bottlenecks of non-compressed alternatives, providing up to 2.2x speedups end-to-end.\r\n\r\nFrom the algorithmic side, we propose a general framework that dynamically adjusts the degree of compression across a model's layers during training. This approach enhances overall compression and results in significant speedups without compromising accuracy. Our algorithm utilizes an adaptive algorithm that automatically selects the optimal compression parameters for model layers, ensuring the best compression ratio while adhering to an error constraint. Our method is effective across all existing families of compression methods. It achieves up to 2.5x faster training and up to a 5x improvement in compression compared to efficient implementations of current approaches. 
Additionally, LGreCo can complement existing adaptive algorithms.\r\n"}],"page":"102","related_material":{"record":[{"id":"17456","status":"public","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","status":"public","id":"14461"},{"relation":"part_of_dissertation","status":"public","id":"12780"}]},"license":"https://creativecommons.org/licenses/by-nc-sa/4.0/","article_processing_charge":"No","date_created":"2024-09-04T08:51:11Z","date_published":"2024-09-04T00:00:00Z","status":"public","OA_place":"publisher","file_date_updated":"2024-09-04T08:36:06Z","supervisor":[{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"publication_identifier":{"issn":["2663-337X"]},"year":"2024","language":[{"iso":"eng"}],"acknowledged_ssus":[{"_id":"ScienComp"}],"author":[{"last_name":"Markov","full_name":"Markov, Ilia","first_name":"Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425"}]},{"oa":1,"citation":{"chicago":"Markov, Ilia, Kaveh Alimohammadi, Elias Frantar, and Dan-Adrian Alistarh. “L-GreCo: Layerwise-Adaptive Gradient Compression for Efficient Data-Parallel Deep Learning.” In <i>Proceedings of Machine Learning and Systems </i>, edited by P. Gibbons, G. Pekhimenko, and C. De Sa, Vol. 6. Association for Computing Machinery, 2024.","ama":"Markov I, Alimohammadi K, Frantar E, Alistarh D-A. L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. In: Gibbons P, Pekhimenko G, De Sa C, eds. <i>Proceedings of Machine Learning and Systems </i>. Vol 6. Association for Computing Machinery; 2024.","short":"I. Markov, K. Alimohammadi, E. Frantar, D.-A. Alistarh, in:, P. Gibbons, G. Pekhimenko, C. De Sa (Eds.), Proceedings of Machine Learning and Systems , Association for Computing Machinery, 2024.","ieee":"I. Markov, K. Alimohammadi, E. Frantar, and D.-A. 
Alistarh, “L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning,” in <i>Proceedings of Machine Learning and Systems </i>, Athens, Greece, 2024, vol. 6.","ista":"Markov I, Alimohammadi K, Frantar E, Alistarh D-A. 2024. L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. Proceedings of Machine Learning and Systems . MLSys: Machine Learning and Systems vol. 6.","apa":"Markov, I., Alimohammadi, K., Frantar, E., &#38; Alistarh, D.-A. (2024). L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep learning. In P. Gibbons, G. Pekhimenko, &#38; C. De Sa (Eds.), <i>Proceedings of Machine Learning and Systems </i> (Vol. 6). Athens, Greece: Association for Computing Machinery.","mla":"Markov, Ilia, et al. “L-GreCo: Layerwise-Adaptive Gradient Compression for Efficient Data-Parallel Deep Learning.” <i>Proceedings of Machine Learning and Systems </i>, edited by P. Gibbons et al., vol. 6, Association for Computing Machinery, 2024."},"publisher":"Association for Computing Machinery","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"intvolume":"         6","volume":6,"conference":{"location":"Athens, Greece","name":"MLSys: Machine Learning and Systems","end_date":"2024-04-22","start_date":"2024-04-22"},"main_file_link":[{"url":"https://proceedings.mlsys.org/paper_files/paper/2024/hash/9069a8976ff06f6443e7f4172990a580-Abstract-Conference.html","open_access":"1"}],"oa_version":"Published Version","day":"01","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","type":"conference","date_updated":"2026-04-07T13:00:54Z","_id":"17456","month":"04","title":"L-GreCo: Layerwise-adaptive gradient compression for efficient data-parallel deep 
learning","corr_author":"1","status":"public","article_processing_charge":"No","date_published":"2024-04-01T00:00:00Z","date_created":"2024-08-22T08:29:25Z","external_id":{"arxiv":["2210.17357"]},"related_material":{"record":[{"id":"17490","status":"public","relation":"dissertation_contains"}]},"editor":[{"first_name":"P.","full_name":"Gibbons, P.","last_name":"Gibbons"},{"last_name":"Pekhimenko","first_name":"G.","full_name":"Pekhimenko, G."},{"full_name":"De Sa, C.","first_name":"C.","last_name":"De Sa"}],"abstract":[{"lang":"eng","text":"Data-parallel distributed training of deep neural networks (DNN) has gained very widespread adoption, but can still experience communication bottlenecks. To address this issue, entire families of compression mechanisms have been developed, including quantization, sparsification, and low-rank approximation, some of which are seeing significant practical adoption. Despite this progress, almost all known compression schemes apply compression uniformly across DNN layers, although layers are heterogeneous in terms of parameter count and their impact on model accuracy.In this work, we provide a general framework for adapting the degree of compression across the model's layers dynamically during training, improving the overall compression, while leading to substantial speedups, without sacrificing accuracy. Our framework, called L-GreCo, is based on an adaptive algorithm, which automatically picks the optimal compression parameters for model layers guaranteeing the best compression ratio while satisfying an error constraint. Extensive experiments over image classification and language modeling tasks shows that L-GreCo is effective across all existing families of compression methods, and achieves up to 2.5\r\n×\r\n training speedup and up to 5\r\n×\r\n compression improvement over efficient implementations of existing approaches, while recovering full accuracy. 
Moreover, L-GreCo is complementary to existing adaptive algorithms, improving their compression ratio by 50\\% and practical throughput by 66\\%. An anonymized implementation is available at https://github.com/LGrCo/L-GreCo."}],"author":[{"full_name":"Markov, Ilia","first_name":"Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","last_name":"Markov"},{"last_name":"Alimohammadi","first_name":"Kaveh","full_name":"Alimohammadi, Kaveh"},{"first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias","last_name":"Frantar"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","last_name":"Alistarh"}],"quality_controlled":"1","publication":"Proceedings of Machine Learning and Systems ","language":[{"iso":"eng"}],"year":"2024"},{"related_material":{"link":[{"url":"https://github.com/IST-DASLab/SPADE","relation":"software"}],"record":[{"relation":"dissertation_contains","status":"public","id":"21854"}]},"external_id":{"arxiv":["2310.04519"]},"article_processing_charge":"No","date_published":"2024-09-01T00:00:00Z","date_created":"2024-09-22T22:01:46Z","status":"public","abstract":[{"text":"It is known that sparsity can improve interpretability for deep neural networks. However, existing methods in the area either require networks that are pre-trained with sparsity constraints, or impose sparsity after the fact, altering the network’s general behavior. In this paper, we demonstrate, for the first time, that sparsity can instead be incorporated into the interpretation process itself, as a sample-specific preprocessing step. Unlike previous work, this approach, which we call SPADE, does not place constraints on the trained model and does not affect its behavior during inference on the sample. 
Given a trained model and a target sample, SPADE uses sample-targeted pruning to provide a \"trace\" of the network’s execution on the sample, reducing the network to the most important connections prior to computing an interpretation. We demonstrate that preprocessing with SPADE significantly increases the accuracy of image saliency maps across several interpretability methods. Additionally, SPADE improves the usefulness of neuron visualizations, aiding humans in reasoning about network behavior. Our code is available at https://github.com/IST-DASLab/SPADE.","lang":"eng"}],"page":"45955-45987","acknowledged_ssus":[{"_id":"ScienComp"}],"quality_controlled":"1","author":[{"full_name":"Moakhar, Arshia Soltani","first_name":"Arshia Soltani","last_name":"Moakhar"},{"id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","first_name":"Eugenia B","full_name":"Iofinova, Eugenia B","orcid":"0000-0002-7778-3221","last_name":"Iofinova"},{"last_name":"Frantar","first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias"},{"last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X"}],"acknowledgement":"The authors would like to thank Stephen Casper and Tony Wang for their feedback on this work, and Eldar Kurtic for his advice on aspects of the project. This research was supported by the Scientific Service Units (SSU) of IST Austria through resources provided by Scientific Computing (SciComp). EI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35.","publication_identifier":{"eissn":["2640-3498"]},"language":[{"iso":"eng"}],"year":"2024","publication":"Proceedings of the 41st International Conference on Machine Learning","intvolume":"       235","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"publisher":"ML Research Press","oa":1,"citation":{"ieee":"A. S. Moakhar, E. B. Iofinova, E. Frantar, and D.-A. 
Alistarh, “SPADE: Sparsity-guided debugging for deep neural networks,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 45955–45987.","ista":"Moakhar AS, Iofinova EB, Frantar E, Alistarh D-A. 2024. SPADE: Sparsity-guided debugging for deep neural networks. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 45955–45987.","apa":"Moakhar, A. S., Iofinova, E. B., Frantar, E., &#38; Alistarh, D.-A. (2024). SPADE: Sparsity-guided debugging for deep neural networks. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 45955–45987). Vienna, Austria: ML Research Press.","mla":"Moakhar, Arshia Soltani, et al. “SPADE: Sparsity-Guided Debugging for Deep Neural Networks.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 45955–87.","ama":"Moakhar AS, Iofinova EB, Frantar E, Alistarh D-A. SPADE: Sparsity-guided debugging for deep neural networks. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:45955-45987.","short":"A.S. Moakhar, E.B. Iofinova, E. Frantar, D.-A. Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 45955–45987.","chicago":"Moakhar, Arshia Soltani, Eugenia B Iofinova, Elias Frantar, and Dan-Adrian Alistarh. “SPADE: Sparsity-Guided Debugging for Deep Neural Networks.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:45955–87. 
ML Research Press, 2024."},"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2310.04519"}],"oa_version":"Preprint","day":"01","conference":{"start_date":"2024-07-21","end_date":"2024-07-27","name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria"},"project":[{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","name":"Vienna Graduate School on Computational Optimization","grant_number":"W1260-N35"}],"volume":235,"scopus_import":"1","type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","alternative_title":["PMLR"],"corr_author":"1","title":"SPADE: Sparsity-guided debugging for deep neural networks","month":"09","date_updated":"2026-05-19T11:20:27Z","_id":"18121"},{"oa":1,"citation":{"chicago":"Shevchenko, Alexander. “High-Dimensional Limits in Artificial Neural Networks.” Institute of Science and Technology Austria, 2024. <a href=\"https://doi.org/10.15479/at:ista:17465\">https://doi.org/10.15479/at:ista:17465</a>.","short":"A. Shevchenko, High-Dimensional Limits in Artificial Neural Networks, Institute of Science and Technology Austria, 2024.","ama":"Shevchenko A. High-dimensional limits in artificial neural networks. 2024. doi:<a href=\"https://doi.org/10.15479/at:ista:17465\">10.15479/at:ista:17465</a>","ieee":"A. Shevchenko, “High-dimensional limits in artificial neural networks,” Institute of Science and Technology Austria, 2024.","ista":"Shevchenko A. 2024. High-dimensional limits in artificial neural networks. Institute of Science and Technology Austria.","apa":"Shevchenko, A. (2024). <i>High-dimensional limits in artificial neural networks</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:17465\">https://doi.org/10.15479/at:ista:17465</a>","mla":"Shevchenko, Alexander. <i>High-Dimensional Limits in Artificial Neural Networks</i>. 
Institute of Science and Technology Austria, 2024, doi:<a href=\"https://doi.org/10.15479/at:ista:17465\">10.15479/at:ista:17465</a>."},"publisher":"Institute of Science and Technology Austria","publication_status":"published","department":[{"_id":"GradSch"},{"_id":"DaAl"},{"_id":"MaMo"}],"has_accepted_license":"1","project":[{"name":"Prix Lopez-Loretta 2019 - Marco Mondelli","_id":"059876FA-7A3F-11EA-A408-12923DDC885E"},{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","grant_number":"W1260-N35","name":"Vienna Graduate School on Computational Optimization"}],"day":"29","oa_version":"Published Version","doi":"10.15479/at:ista:17465","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","type":"dissertation","date_updated":"2025-04-25T10:32:06Z","_id":"17465","month":"08","title":"High-dimensional limits in artificial neural networks","ddc":["519"],"alternative_title":["ISTA Thesis"],"corr_author":"1","status":"public","OA_place":"repository","article_processing_charge":"No","date_created":"2024-08-28T15:14:25Z","date_published":"2024-08-29T00:00:00Z","related_material":{"record":[{"status":"public","id":"11420","relation":"part_of_dissertation"},{"id":"17469","status":"public","relation":"part_of_dissertation"},{"id":"14459","status":"public","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","id":"9198","status":"public"}]},"abstract":[{"text":"In the modern age of machine learning, artificial neural networks have become an integral part\r\nof many practical systems. One of the key ingredients of the success of the deep learning\r\napproach is recent computational advances which allowed the training of models with billions\r\nof parameters on large-scale data. Such over-parameterized and data-hungry regimes pose a\r\nchallenge for the theoretical analysis of modern models since “classical” statistical wisdom\r\nis no longer applicable. 
In this view, it is paramount to extend or develop new machinery\r\nthat will allow tackling the neural network analysis under new challenging asymptotic regimes,\r\nwhich is the focus of this thesis.\r\nLarge neural network systems are usually optimized via “local” search algorithms, such\r\nas stochastic gradient descent (SGD). However, given the high-dimensional nature of the\r\nparameter space, it is a priori not clear why such a crude “local” approach works so remarkably\r\nwell in practice. We take a step towards demystifying this phenomenon by showing that\r\nthe landscape of the SGD training dynamics exhibits a few beneficial properties for the\r\noptimization. First, we show that along the SGD trajectory an over-parameterized network\r\nis dropout stable. The emergence of dropout stability allows to conclude that the minima\r\nfound by SGD are connected via a continuous path of small loss. This in turn means that\r\nthe high-dimensional landscape of the neural network optimization problem is provably not so\r\nunfavourable to gradient-based training, due to mode connectivity. Next, we show that SGD\r\nfor an over-parameterized network tends to find solutions that are functionally more “simple”.\r\nThis in turn means that the SGD minima are more robust, since a less complicated solution\r\nwill less likely overfit the data. More formally, for a prototypical example of a wide two-layer\r\nReLU network on a 1d regression task we show that the SGD algorithm is implicitly selective in\r\nits choice of an interpolating solution. Namely, at convergence the neural network implements\r\na piece-wise linear function with the number of linear regions depending only on the amount\r\nof training data. 
This is in contrast to a “smooth”-like behaviour which one would expect\r\ngiven such a severe over-parameterization of the model.\r\nDiverging from the generic supervised setting of classification and regression problems, we\r\nanalyze an auto-encoder model that is commonly used for representation learning and data\r\ncompression. Despite the wide applicability of the auto-encoding paradigm, the theoretical\r\nunderstanding of their behaviour is limited even in the simplistic shallow case. The related\r\nwork is restricted to extreme asymptotic regimes in which the auto-encoder is either severely\r\nover-parameterized or under-parameterized. In contrast, we provide a tight characterization\r\nfor the 1-bit compression of Gaussian signals in the challenging proportional regime, i.e., the\r\ninput dimension and the size of the compressed representation obey the same asymptotics.\r\nWe also show that gradient-based methods are able to find a globally optimal solution and\r\nthat the predictions made for Gaussian data extrapolate beyond - to the case of compression\r\nof natural images. Next, we relax the Gaussian assumption and study more structured input\r\nsources. We show that the shallow model is sometimes agnostic to the structure of the data\r\nwhich results in a Gaussian-like behaviour. 
We prove that making the decoding component\r\nslightly less shallow is already enough to escape the “curse” of Gaussian performance.\r\n","lang":"eng"}],"page":"232","file":[{"content_type":"application/pdf","access_level":"open_access","creator":"ashevche","file_id":"17482","file_size":4468610,"date_updated":"2024-10-05T22:30:05Z","embargo":"2024-10-04","relation":"main_file","date_created":"2024-09-02T09:23:32Z","checksum":"da6dd3166078934577f6af93d27000e2","file_name":"thesis_a2b.pdf"},{"embargo_to":"open_access","relation":"source_file","date_created":"2024-09-02T09:23:46Z","file_name":"Thesis Alex - ISTA.zip","checksum":"76a39ef252239560923cdda4ce0a31a4","content_type":"application/zip","access_level":"closed","creator":"ashevche","file_id":"17483","date_updated":"2024-10-05T22:30:05Z","file_size":15930999}],"degree_awarded":"PhD","author":[{"full_name":"Shevchenko, Aleksandr","first_name":"Aleksandr","id":"F2B06EC2-C99E-11E9-89F0-752EE6697425","last_name":"Shevchenko"}],"acknowledged_ssus":[{"_id":"ScienComp"}],"language":[{"iso":"eng"}],"year":"2024","supervisor":[{"orcid":"0000-0002-3242-7020","full_name":"Mondelli, Marco","first_name":"Marco","id":"27EB676C-8706-11E9-9510-7717E6697425","last_name":"Mondelli"},{"orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"publication_identifier":{"issn":["2663-337X"]},"file_date_updated":"2024-10-05T22:30:05Z"}]
