[{"supervisor":[{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"publication_identifier":{"issn":["2663-337X"]},"year":"2026","language":[{"iso":"eng"}],"file_date_updated":"2026-05-13T13:10:48Z","acknowledgement":"The research in this Ph.D. was funded in whole\r\nor in part by the Austrian Science Fund (FWF) W1260-N35 (Vienna Graduate School for\r\nComputational Optimization). For open access purposes the author has applied a CC BY\r\npublic copyright license to any author accepted manuscript version arising from this submission\r\nwherever possible. Additionally, I am grateful to Alois Schlögl, Waleed Khalid, and the rest of\r\nthe ISTA Scientific Computing team for building and maintaining the infrastructure I used\r\nto run experiments. I’m also deeply grateful to the Alistarh group’s administrative assistant,\r\nChristine Francois, who always deals with our nonsense with common sense and a smile.\r\n","author":[{"last_name":"Iofinova","orcid":"0000-0002-7778-3221","full_name":"Iofinova, Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","first_name":"Eugenia B"}],"acknowledged_ssus":[{"_id":"ScienComp"}],"file":[{"file_id":"21856","date_updated":"2026-05-11T08:36:01Z","file_size":28479571,"content_type":"application/zip","creator":"eiofinov","access_level":"closed","date_created":"2026-05-11T08:36:01Z","file_name":"EIofinova_thesis_FinalVersion.zip","checksum":"2e148dad920e3f9b7c32796e0ba2e5f7","relation":"source_file"},{"content_type":"application/pdf","creator":"eiofinov","access_level":"open_access","success":1,"file_id":"21877","file_size":18137757,"date_updated":"2026-05-13T13:10:48Z","relation":"main_file","date_created":"2026-05-13T13:10:48Z","checksum":"b10c2933f386f532b2dbf28b19c5525c","file_name":"2026_Iofinova_Eugenia_Thesis.pdf"}],"abstract":[{"lang":"eng","text":"As neural-network-based models grow both in size and popularity, 
interest has grown in making the models smaller and more efficient to train. To that end, many methods have been proposed to prune models by reducing their number of nonzero parameters. Additionally, parameter-efficient fine-tuning, in which a much smaller number of parameters than the total contained in the model is updated during training, has become very popular, especially in the space of Large Language Models. At the same time, the increasingly routine deployment of machine learning in real-world applications has spurred a drive to make them more trustworthy - in the sense of, among other things, being unbiased, interpretable, and editable. In this thesis, we examine the interplay between efficiency and trustworthiness.\r\n\r\nFirst, we analyze the effects of model pruning on bias in computer vision models, demonstrating that increased sparsity leads to greater bias, largely as a function of increased model uncertainty in marginal cases. Based on this observation, we propose several bias mitigation techniques. Then, we demonstrate that example-specific model pruning can improve model interpretation methods while improving pruning efficiency to make example-specific model pruning feasible in real time. Then, we investigate the effectiveness of parameter-efficient and data-efficient model personalization via fine-tuning, demonstrating that it is highly feasible with very small computational and data resources. Finally, we consider efficiency in editing model knowledge using a custom synthetic data framework, demonstrating that parameter-efficient, low-rank fine-tuning frequently outperforms full-rank fine-tuning, and, additionally, that restricting which model blocks are fine-tuned frequently improves results. 
Together, the results in this thesis provide new insights and techniques for combining trustworthiness and efficiency during neural network inference and training.\r\n\r\n-----------------“In reference to IEEE copyrighted material which is used with permission in this thesis, the IEEE does not endorse any of [name of university or educational entity]’s products or services. Internal or personal use of this material is permitted. If interested in reprinting/republishing IEEE copyrighted material for advertising or promotional purposes or for creating new collective works for resale or redistribution, please go to http://www.ieee.org/publications_standards/publications/rights/rights_link.html to learn how to obtain a License from RightsLink. If applicable, University Microfilms and/or ProQuest Library, or the Archives of Canada may supply single copies of the dissertation.”"}],"page":"237","degree_awarded":"PhD","article_processing_charge":"No","date_published":"2026-05-11T00:00:00Z","date_created":"2026-05-11T08:43:22Z","OA_place":"publisher","status":"public","related_material":{"record":[{"relation":"part_of_dissertation","id":"14771","status":"public"},{"relation":"part_of_dissertation","status":"public","id":"18121"},{"id":"21858","status":"public","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","id":"21859","status":"public"},{"status":"public","id":"21857","relation":"part_of_dissertation"}]},"month":"05","date_updated":"2026-05-19T11:20:28Z","_id":"21854","alternative_title":["ISTA Thesis"],"corr_author":"1","ddc":["000"],"title":"On the utility and effects of efficiency in artificial neural networks","type":"dissertation","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","project":[{"grant_number":"W1260-N35","name":"Vienna Graduate School on Computational Optimization","_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A"}],"has_accepted_license":"1","day":"11","oa_version":"Published 
Version","doi":"10.15479/AT-ISTA-21854","publisher":"Institute of Science and Technology Austria","oa":1,"citation":{"chicago":"Iofinova, Eugenia B. “On the Utility and Effects of Efficiency in Artificial Neural Networks.” Institute of Science and Technology Austria, 2026. <a href=\"https://doi.org/10.15479/AT-ISTA-21854\">https://doi.org/10.15479/AT-ISTA-21854</a>.","ama":"Iofinova EB. On the utility and effects of efficiency in artificial neural networks. 2026. doi:<a href=\"https://doi.org/10.15479/AT-ISTA-21854\">10.15479/AT-ISTA-21854</a>","short":"E.B. Iofinova, On the Utility and Effects of Efficiency in Artificial Neural Networks, Institute of Science and Technology Austria, 2026.","mla":"Iofinova, Eugenia B. <i>On the Utility and Effects of Efficiency in Artificial Neural Networks</i>. Institute of Science and Technology Austria, 2026, doi:<a href=\"https://doi.org/10.15479/AT-ISTA-21854\">10.15479/AT-ISTA-21854</a>.","ieee":"E. B. Iofinova, “On the utility and effects of efficiency in artificial neural networks,” Institute of Science and Technology Austria, 2026.","apa":"Iofinova, E. B. (2026). <i>On the utility and effects of efficiency in artificial neural networks</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/AT-ISTA-21854\">https://doi.org/10.15479/AT-ISTA-21854</a>","ista":"Iofinova EB. 2026. On the utility and effects of efficiency in artificial neural networks. 
Institute of Science and Technology Austria."},"publication_status":"published","department":[{"_id":"GradSch"},{"_id":"DaAl"}]},{"title":"Panza: Investigating the feasibility of fully-local personalized text generation","corr_author":"1","_id":"21857","date_updated":"2026-05-19T11:20:27Z","month":"03","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"type":"conference_poster","day":"06","main_file_link":[{"url":"https://openreview.net/pdf?id=soFWnTqd23","open_access":"1"}],"oa_version":"Accepted Version","conference":{"name":"CPAL: Conference on Parsimony and Learning","location":"Tübingen, Germany","start_date":"2026-03-23","end_date":"2026-03-26"},"department":[{"_id":"GradSch"},{"_id":"DaAl"}],"publication_status":"published","keyword":["LLMs","PEFT","LoRA","personalization","efficient ML"],"citation":{"mla":"Nicolicioiu, Armand, et al. “Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation.” <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>, 81, OpenReview, 2026.","apa":"Nicolicioiu, A., Iofinova, E. B., Jovanovic, A., Kurtic, E., Nikdan, M., Panferov, A., … Alistarh, D.-A. (2026). <i>Panza: Investigating the feasibility of fully-local personalized text generation</i>. <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>. Tübingen, Germany: OpenReview.","ieee":"A. Nicolicioiu <i>et al.</i>, <i>Panza: Investigating the feasibility of fully-local personalized text generation</i>. OpenReview, 2026.","ista":"Nicolicioiu A, Iofinova EB, Jovanovic A, Kurtic E, Nikdan M, Panferov A, Markov I, Shavit N, Alistarh D-A. 2026. 
Panza: Investigating the feasibility of fully-local personalized text generation, OpenReview,p.","chicago":"Nicolicioiu, Armand, Eugenia B Iofinova, Andrej Jovanovic, Eldar Kurtic, Mahdi Nikdan, Andrei Panferov, Ilia Markov, Nir Shavit, and Dan-Adrian Alistarh. <i>Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation</i>. <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>. OpenReview, 2026.","ama":"Nicolicioiu A, Iofinova EB, Jovanovic A, et al. <i>Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation</i>. OpenReview; 2026.","short":"A. Nicolicioiu, E.B. Iofinova, A. Jovanovic, E. Kurtic, M. Nikdan, A. Panferov, I. Markov, N. Shavit, D.-A. Alistarh, Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation, OpenReview, 2026."},"oa":1,"publisher":"OpenReview","publication":"Third Conference on Parsimony and Learning (Proceedings Track)","year":"2026","language":[{"iso":"eng"}],"author":[{"first_name":"Armand","full_name":"Nicolicioiu, Armand","last_name":"Nicolicioiu"},{"orcid":"0000-0002-7778-3221","full_name":"Iofinova, Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","first_name":"Eugenia B","last_name":"Iofinova"},{"full_name":"Jovanovic, Andrej","first_name":"Andrej","last_name":"Jovanovic"},{"full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","first_name":"Eldar","last_name":"Kurtic"},{"first_name":"Mahdi","id":"66374281-f394-11eb-9cf6-869147deecc0","full_name":"Nikdan, Mahdi","last_name":"Nikdan"},{"last_name":"Panferov","id":"2c18daae-4dbe-11ef-8491-98ce2d960f09","first_name":"Andrei","full_name":"Panferov, Andrei"},{"id":"D0CF4148-C985-11E9-8066-0BDEE5697425","first_name":"Ilia","full_name":"Markov, Ilia","last_name":"Markov"},{"full_name":"Shavit, Nir","first_name":"Nir","last_name":"Shavit"},{"orcid":"0000-0003-3650-940X","full_name":"Alistarh, 
Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"article_number":"81","quality_controlled":"1","OA_type":"green","abstract":[{"lang":"eng","text":"The availability of powerful open-source large language models (LLMs) opens exciting use cases, such as using personal data to fine-tune these models to imitate a user’s unique writing style. Two key requirements for this functionality are personalization–in the sense that the output should recognizably reflect the user’s own writing style—and privacy–users may justifiably be wary of uploading extremely personal data, such as their email archive, to a third-party service. In this paper, we demonstrate the feasibility of training and running such an assistant, which we call Panza, on commodity hardware, for the specific use case of email generation. Panza’s personalization features are based on a combination of parameter-efficient fine-tuning using a variant of the Reverse Instructions technique [1] and Retrieval-Augmented Generation (RAG) [2]. We demonstrate that this combination allows us to fine-tune an LLM to reflect a user’s writing style using limited data, while executing on extremely limited resources, e.g. on a free Google Colab instance. Our key methodological contribution is the first detailed study of evaluation metrics for this task, and\r\nof how different choices of system components–the use of RAG and of different fine-tuning approaches–impact the system’s performance. Additionally, we demonstrate that very little data - under 100 email samples - are sufficient to create models that convincingly imitate humans, showcasing a previously unknown attack vector in language models. 
We are releasing the full Panza code as well as three new email datasets licensed for research use."}],"related_material":{"record":[{"id":"21854","status":"public","relation":"dissertation_contains"}]},"status":"public","OA_place":"publisher","date_created":"2026-05-11T08:50:28Z","date_published":"2026-03-06T00:00:00Z","article_processing_charge":"No"},{"month":"01","date_updated":"2026-05-19T11:20:27Z","_id":"21859","corr_author":"1","title":"Behemoth: Benchmarking unlearning in LLMs using fully synthetic data","type":"preprint","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","project":[{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","name":"Vienna Graduate School on Computational Optimization","grant_number":"W1260-N35"}],"main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2601.23153","open_access":"1"}],"day":"30","oa_version":"Preprint","doi":"10.48550/arXiv.2601.23153","oa":1,"citation":{"chicago":"Iofinova, Eugenia B, and Dan-Adrian Alistarh. “Behemoth: Benchmarking Unlearning in LLMs Using Fully Synthetic Data.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/arXiv.2601.23153\">https://doi.org/10.48550/arXiv.2601.23153</a>.","ama":"Iofinova EB, Alistarh D-A. Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>","short":"E.B. Iofinova, D.-A. Alistarh, ArXiv (n.d.).","ista":"Iofinova EB, Alistarh D-A. Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. arXiv, <a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>.","ieee":"E. B. Iofinova and D.-A. Alistarh, “Behemoth: Benchmarking unlearning in LLMs using fully synthetic data,” <i>arXiv</i>. .","apa":"Iofinova, E. B., &#38; Alistarh, D.-A. (n.d.). Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. <i>arXiv</i>. 
<a href=\"https://doi.org/10.48550/arXiv.2601.23153\">https://doi.org/10.48550/arXiv.2601.23153</a>","mla":"Iofinova, Eugenia B., and Dan-Adrian Alistarh. “Behemoth: Benchmarking Unlearning in LLMs Using Fully Synthetic Data.” <i>ArXiv</i>, doi:<a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>."},"arxiv":1,"publication_status":"draft","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"year":"2026","language":[{"iso":"eng"}],"publication":"arXiv","acknowledgement":"EI thanks Weiwei Yang, Janardhan Kulkani, and Kate Lytvynets for their advice and support in\r\ndeveloping an earlier version of the Behemoth library. This research was supported by the Scientific\r\nService Units (SSU) of IST Austria through resources provided by Scientific Computing (SciComp).\r\nEI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35.\r\n","author":[{"last_name":"Iofinova","first_name":"Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","orcid":"0000-0002-7778-3221","full_name":"Iofinova, Eugenia B"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"text":"As artificial neural networks, and specifically large language models, have improved rapidly in capabilities and quality, they have increasingly been deployed in real-world applications, from customer service to Google search, despite the fact that they frequently make factually incorrect or undesirable statements. This trend has inspired practical and academic interest in model editing, that is, in adjusting the weights of the model to modify its likely outputs for queries relating to a specific fact or set of facts. 
This may be done either to amend a fact or set of facts, for instance, to fix a frequent error in the training data, or to suppress a fact or set of facts entirely, for instance, in case of dangerous knowledge. Multiple methods have been proposed to do such edits. However, at the same time, it has been shown that such model editing can be brittle and incomplete. Moreover the effectiveness of any model editing method necessarily depends on the data on which the model is trained, and, therefore, a good understanding of the interaction of the training data distribution and the way it is stored in the network is necessary and helpful to reliably perform model editing. However, working with large language models trained on real-world data does not allow us to understand this relationship or fully measure the effects of model editing. We therefore propose Behemoth, a fully synthetic data generation framework. To demonstrate the practical insights from the framework, we explore model editing in the context of simple tabular data, demonstrating surprising findings that, in some cases, echo real-world results, for instance, that in some cases restricting the update rank results in a more effective update.","lang":"eng"}],"OA_type":"green","article_processing_charge":"No","date_published":"2026-01-30T00:00:00Z","date_created":"2026-05-11T08:58:07Z","status":"public","OA_place":"repository","related_material":{"record":[{"id":"21854","status":"public","relation":"dissertation_contains"}]},"external_id":{"arxiv":["2601.23153"]}},{"tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","type":"conference","date_updated":"2025-09-30T13:41:57Z","_id":"19877","month":"02","ddc":["000"],"title":"MARLIN: Mixed-precision auto-regressive parallel inference on Large Language 
Models","corr_author":"1","oa":1,"citation":{"ama":"Frantar E, Castro RL, Chen J, Hoefler T, Alistarh D-A. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. In: <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>. Association for Computing Machinery; 2025:239-251. doi:<a href=\"https://doi.org/10.1145/3710848.3710871\">10.1145/3710848.3710871</a>","short":"E. Frantar, R.L. Castro, J. Chen, T. Hoefler, D.-A. Alistarh, in:, Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, Association for Computing Machinery, 2025, pp. 239–251.","chicago":"Frantar, Elias, Roberto L. Castro, Jiale Chen, Torsten Hoefler, and Dan-Adrian Alistarh. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” In <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, 239–51. Association for Computing Machinery, 2025. <a href=\"https://doi.org/10.1145/3710848.3710871\">https://doi.org/10.1145/3710848.3710871</a>.","mla":"Frantar, Elias, et al. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, Association for Computing Machinery, 2025, pp. 239–51, doi:<a href=\"https://doi.org/10.1145/3710848.3710871\">10.1145/3710848.3710871</a>.","ista":"Frantar E, Castro RL, Chen J, Hoefler T, Alistarh D-A. 2025. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. PPoPP: Symposium on Principles and Practice of Parallel Programming, 239–251.","ieee":"E. Frantar, R. L. Castro, J. Chen, T. Hoefler, and D.-A. 
Alistarh, “MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models,” in <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, Las Vegas, NV, United States, 2025, pp. 239–251.","apa":"Frantar, E., Castro, R. L., Chen, J., Hoefler, T., &#38; Alistarh, D.-A. (2025). MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. In <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i> (pp. 239–251). Las Vegas, NV, United States: Association for Computing Machinery. <a href=\"https://doi.org/10.1145/3710848.3710871\">https://doi.org/10.1145/3710848.3710871</a>"},"publisher":"Association for Computing Machinery","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"scopus_import":"1","has_accepted_license":"1","conference":{"end_date":"2025-03-05","start_date":"2025-03-01","name":"PPoPP: Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas, NV, United States"},"day":"28","oa_version":"Published Version","doi":"10.1145/3710848.3710871","author":[{"last_name":"Frantar","full_name":"Frantar, Elias","first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f"},{"first_name":"Roberto L.","full_name":"Castro, Roberto L.","last_name":"Castro"},{"orcid":"0000-0001-5337-5875","full_name":"Chen, Jiale","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","first_name":"Jiale","last_name":"Chen"},{"first_name":"Torsten","full_name":"Hoefler, Torsten","last_name":"Hoefler"},{"orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"quality_controlled":"1","language":[{"iso":"eng"}],"publication":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel 
Programming","isi":1,"year":"2025","publication_identifier":{"isbn":["9798400714436"]},"file_date_updated":"2025-06-24T06:04:17Z","acknowledgement":"The authors would like to thank the Neural Magic team, in particular Michael Goin, Alexander Matveev, and Rob Shaw, for support with the vLLM integration. This research was supported in part by generous grants from NVIDIA and Google.","status":"public","OA_place":"publisher","article_processing_charge":"Yes (via OA deal)","date_published":"2025-02-28T00:00:00Z","date_created":"2025-06-23T13:51:58Z","external_id":{"arxiv":["2408.11743"],"isi":["001437826500019"]},"related_material":{"record":[{"status":"public","id":"19884","relation":"software"}]},"abstract":[{"lang":"eng","text":"As inference on Large Language Models (LLMs) emerges as an important workload in machine learning applications, model weight quantization has become a standard technique for efficient GPU deployment. Quantization not only reduces model size, but has also been shown to yield substantial speedups for single-user inference, due to reduced memory movement, with low accuracy impact. Yet, it remains a key open question whether speedups are achievable also in batched settings with multiple parallel clients, which are highly relevant for practical serving. It is unclear whether GPU kernels can be designed to remain practically memory-bound, while supporting the substantially increased compute requirements of batched workloads.\r\nIn this paper, we resolve this question positively by introducing a new design for Mixed-precision Auto-Regressive LINear kernels, called MARLIN. Concretely, given a model whose weights are compressed via quantization to, e.g., 4 bits per element, MARLIN shows that batchsizes up to 16-32 can be practically supported with close to maximum (4×) quantization speedup, and larger batchsizes up to 64-128 with gradually decreasing, but still significant, acceleration. 
MARLIN accomplishes this via a combination of techniques, such as asynchronous memory access, complex task scheduling and pipelining, and bespoke quantization support. Our experiments show that MARLIN's near-optimal performance on individual LLM layers across different scenarios can also lead to significant end-to-end LLM inference speedups (of up to 2.8×) when integrated with the popular vLLM open-source serving engine. Finally, we show that MARLIN is extensible to further compression techniques, like NVIDIA 2:4 sparsity, leading to additional speedups."}],"page":"239-251","OA_type":"hybrid","file":[{"file_name":"2025_PPoPP_Frantar.pdf","checksum":"a0566ea3c168e8273501a5eb7d767cf8","date_created":"2025-06-24T06:04:17Z","relation":"main_file","date_updated":"2025-06-24T06:04:17Z","file_size":1330044,"file_id":"19883","success":1,"creator":"dernst","access_level":"open_access","content_type":"application/pdf"}]},{"month":"09","date_updated":"2025-12-30T09:04:18Z","_id":"19969","corr_author":"1","title":"Near-optimal leader election in population protocols on graphs","ddc":["510"],"type":"journal_article","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","ec_funded":1,"project":[{"grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning","_id":"268A44D6-B435-11E9-9278-68D0E5697425","call_identifier":"H2020"}],"volume":38,"scopus_import":"1","has_accepted_license":"1","day":"01","oa_version":"Published Version","doi":"10.1007/s00446-025-00487-7","publisher":"Springer Nature","oa":1,"citation":{"apa":"Alistarh, D.-A., Rybicki, J., &#38; Voitovych, S. (2025). Near-optimal leader election in population protocols on graphs. <i>Distributed Computing</i>. Springer Nature. 
<a href=\"https://doi.org/10.1007/s00446-025-00487-7\">https://doi.org/10.1007/s00446-025-00487-7</a>","ieee":"D.-A. Alistarh, J. Rybicki, and S. Voitovych, “Near-optimal leader election in population protocols on graphs,” <i>Distributed Computing</i>, vol. 38. Springer Nature, pp. 207–245, 2025.","ista":"Alistarh D-A, Rybicki J, Voitovych S. 2025. Near-optimal leader election in population protocols on graphs. Distributed Computing. 38, 207–245.","mla":"Alistarh, Dan-Adrian, et al. “Near-Optimal Leader Election in Population Protocols on Graphs.” <i>Distributed Computing</i>, vol. 38, Springer Nature, 2025, pp. 207–45, doi:<a href=\"https://doi.org/10.1007/s00446-025-00487-7\">10.1007/s00446-025-00487-7</a>.","short":"D.-A. Alistarh, J. Rybicki, S. Voitovych, Distributed Computing 38 (2025) 207–245.","ama":"Alistarh D-A, Rybicki J, Voitovych S. Near-optimal leader election in population protocols on graphs. <i>Distributed Computing</i>. 2025;38:207-245. doi:<a href=\"https://doi.org/10.1007/s00446-025-00487-7\">10.1007/s00446-025-00487-7</a>","chicago":"Alistarh, Dan-Adrian, Joel Rybicki, and Sasha Voitovych. “Near-Optimal Leader Election in Population Protocols on Graphs.” <i>Distributed Computing</i>. Springer Nature, 2025. <a href=\"https://doi.org/10.1007/s00446-025-00487-7\">https://doi.org/10.1007/s00446-025-00487-7</a>."},"intvolume":"        38","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"publication_identifier":{"issn":["0178-2770"],"eissn":["1432-0452"]},"isi":1,"year":"2025","publication":"Distributed Computing","language":[{"iso":"eng"}],"file_date_updated":"2025-12-30T09:03:55Z","acknowledgement":"We thank all anonymous reviewers for their helpful comments. We would also like to thank Jakob Solnerzik and Olivier Stietel for catching some errors in the proofs. Open Access funding enabled and organized by Projekt DEAL. 
We gratefully acknowledge funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 805223 ScaleML).","quality_controlled":"1","author":[{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh"},{"id":"334EFD2E-F248-11E8-B48F-1D18A9856A87","first_name":"Joel","orcid":"0000-0002-6432-6646","full_name":"Rybicki, Joel","last_name":"Rybicki"},{"full_name":"Voitovych, Sasha","first_name":"Sasha","last_name":"Voitovych"}],"PlanS_conform":"1","file":[{"content_type":"application/pdf","creator":"dernst","access_level":"open_access","success":1,"file_id":"20900","file_size":770705,"date_updated":"2025-12-30T09:03:55Z","relation":"main_file","date_created":"2025-12-30T09:03:55Z","checksum":"2789c0fdfb58f64930f05f6ac2b3ca61","file_name":"2025_DistributedComp_Alistarh.pdf"}],"abstract":[{"text":"In the stochastic population protocol model, we are given a connected graph with n nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1)\r\nexactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows. On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in Θ(n log n) expected steps using Θ(log log n) states, whereas protocols that use O(1) states require Θ(n²) expected steps. In this work, we investigate the complexity of stable leader election on graphs. 
We provide the first non-trivial time lower bounds on general graphs, showing that, when moving beyond cliques, the complexity of stable leader election can range from O(1) to Θ(n³) expected steps. We describe a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only O(log² n) states that is at most a factor O(log n) slower. Finally, we observe that for many graphs the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor O(n log n) slower than the fast polynomial-state protocol, and among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs.","lang":"eng"}],"page":"207-245","OA_type":"hybrid","article_processing_charge":"Yes (via OA deal)","date_created":"2025-07-06T22:01:24Z","date_published":"2025-09-01T00:00:00Z","OA_place":"publisher","status":"public","related_material":{"record":[{"relation":"earlier_version","status":"public","id":"11844"}]},"external_id":{"isi":["001518300400001"],"arxiv":["2205.12597"]},"article_type":"original"},{"month":"04","_id":"20032","date_updated":"2025-08-04T08:03:11Z","corr_author":"1","title":"Scalable mechanistic neural networks","ddc":["000"],"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"conference":{"location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations","start_date":"2025-04-24","end_date":"2025-04-28"},"scopus_import":"1","has_accepted_license":"1","day":"01","oa_version":"Published Version","publisher":"ICLR","citation":{"ista":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. 2025. Scalable mechanistic neural networks. 
13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 63716–63737.","ieee":"J. Chen, D. Yao, A. A. Pervez, D.-A. Alistarh, and F. Locatello, “Scalable mechanistic neural networks,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 63716–63737.","apa":"Chen, J., Yao, D., Pervez, A. A., Alistarh, D.-A., &#38; Locatello, F. (2025). Scalable mechanistic neural networks. In <i>13th International Conference on Learning Representations</i> (pp. 63716–63737). Singapore, Singapore: ICLR.","mla":"Chen, Jiale, et al. “Scalable Mechanistic Neural Networks.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 63716–37.","chicago":"Chen, Jiale, Dingling Yao, Adeel A Pervez, Dan-Adrian Alistarh, and Francesco Locatello. “Scalable Mechanistic Neural Networks.” In <i>13th International Conference on Learning Representations</i>, 63716–37. ICLR, 2025.","short":"J. Chen, D. Yao, A.A. Pervez, D.-A. Alistarh, F. Locatello, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 63716–63737.","ama":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. Scalable mechanistic neural networks. In: <i>13th International Conference on Learning Representations</i>. 
ICLR; 2025:63716-63737."},"oa":1,"department":[{"_id":"DaAl"},{"_id":"FrLo"}],"arxiv":1,"publication_status":"published","publication_identifier":{"isbn":["9798331320850"]},"publication":"13th International Conference on Learning Representations","year":"2025","language":[{"iso":"eng"}],"file_date_updated":"2025-07-22T07:58:22Z","quality_controlled":"1","author":[{"last_name":"Chen","orcid":"0000-0001-5337-5875","full_name":"Chen, Jiale","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","first_name":"Jiale"},{"id":"d3e02e50-48a8-11ee-8f62-c108061797fa","first_name":"Dingling","full_name":"Yao, Dingling","last_name":"Yao"},{"last_name":"Pervez","full_name":"Pervez, Adeel A","id":"fca6d90c-d47f-11ee-bc87-93ff51604981","first_name":"Adeel A"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"},{"id":"26cfd52f-2483-11ee-8040-88983bcc06d4","first_name":"Francesco","orcid":"0000-0002-4850-0683","full_name":"Locatello, Francesco","last_name":"Locatello"}],"file":[{"relation":"main_file","date_created":"2025-07-22T07:58:22Z","file_name":"2025_ICLR_Chen.pdf","checksum":"64cfdb12ae3e4e8ba57b1403e1066776","content_type":"application/pdf","success":1,"creator":"dernst","access_level":"open_access","file_id":"20065","date_updated":"2025-07-22T07:58:22Z","file_size":732745}],"OA_type":"diamond","page":"63716-63737","abstract":[{"lang":"eng","text":"We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. 
Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. Source code is available at https://github.com/IST-DASLab/ScalableMNN."}],"date_created":"2025-07-20T22:02:01Z","date_published":"2025-04-01T00:00:00Z","article_processing_charge":"No","OA_place":"publisher","status":"public","related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/ScalableMNN"}]},"external_id":{"arxiv":["2410.06074"]}},{"author":[{"last_name":"Robert","full_name":"Robert, Thomas","first_name":"Thomas"},{"last_name":"Safaryan","first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","full_name":"Safaryan, Mher"},{"last_name":"Modoranu","full_name":"Modoranu, Ionut-Vlad","id":"449f7a18-f128-11eb-9611-9b430c0c6333","first_name":"Ionut-Vlad"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"quality_controlled":"1","file_date_updated":"2025-08-04T08:39:51Z","year":"2025","publication":"13th International Conference on Learning Representations","language":[{"iso":"eng"}],"publication_identifier":{"isbn":["9798331320850"]},"external_id":{"arxiv":["2410.16103"]},"related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/LDAdam"}]},"OA_place":"publisher","status":"public","article_processing_charge":"No","date_created":"2025-07-20T22:02:02Z","date_published":"2025-04-01T00:00:00Z","abstract":[{"text":"We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. 
This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models.","lang":"eng"}],"page":"101877-101913","OA_type":"diamond","file":[{"file_size":1346111,"date_updated":"2025-08-04T08:39:51Z","file_id":"20113","access_level":"open_access","creator":"dernst","success":1,"content_type":"application/pdf","file_name":"2025_ICLR_Robert.pdf","checksum":"9327d82569358d7bf1c3ec1a9952e721","date_created":"2025-08-04T08:39:51Z","relation":"main_file"}],"tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","title":"LDAdam: Adaptive optimization from low-dimensional gradient statistics","ddc":["000"],"corr_author":"1","date_updated":"2025-08-04T08:41:10Z","_id":"20034","month":"04","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"oa":1,"citation":{"short":"T. Robert, M. Safaryan, I.-V. Modoranu, D.-A. Alistarh, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 101877–101913.","ama":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. LDAdam: Adaptive optimization from low-dimensional gradient statistics. In: <i>13th International Conference on Learning Representations</i>. 
ICLR; 2025:101877-101913.","chicago":"Robert, Thomas, Mher Safaryan, Ionut-Vlad Modoranu, and Dan-Adrian Alistarh. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” In <i>13th International Conference on Learning Representations</i>, 101877–913. ICLR, 2025.","ieee":"T. Robert, M. Safaryan, I.-V. Modoranu, and D.-A. Alistarh, “LDAdam: Adaptive optimization from low-dimensional gradient statistics,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 101877–101913.","ista":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. 2025. LDAdam: Adaptive optimization from low-dimensional gradient statistics. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 101877–101913.","apa":"Robert, T., Safaryan, M., Modoranu, I.-V., &#38; Alistarh, D.-A. (2025). LDAdam: Adaptive optimization from low-dimensional gradient statistics. In <i>13th International Conference on Learning Representations</i> (pp. 101877–101913). Singapore, Singapore: ICLR.","mla":"Robert, Thomas, et al. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 101877–913."},"publisher":"ICLR","oa_version":"Published Version","day":"01","has_accepted_license":"1","scopus_import":"1","conference":{"name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore","end_date":"2025-04-28","start_date":"2025-04-24"}},{"citation":{"chicago":"Sawmya, Shashata, Linghao Kong, Ilia Markov, Dan-Adrian Alistarh, and Nir Shavit. “Wasserstein Distances, Neuronal Entanglement, and Sparsity.” In <i>13th International Conference on Learning Representations</i>, 26244–74. ICLR, 2025.","short":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, N. Shavit, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 
26244–26274.","ama":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. Wasserstein distances, neuronal entanglement, and sparsity. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:26244-26274.","apa":"Sawmya, S., Kong, L., Markov, I., Alistarh, D.-A., &#38; Shavit, N. (2025). Wasserstein distances, neuronal entanglement, and sparsity. In <i>13th International Conference on Learning Representations</i> (pp. 26244–26274). Singapore, Singapore: ICLR.","ista":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. 2025. Wasserstein distances, neuronal entanglement, and sparsity. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 26244–26274.","ieee":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, and N. Shavit, “Wasserstein distances, neuronal entanglement, and sparsity,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 26244–26274.","mla":"Sawmya, Shashata, et al. “Wasserstein Distances, Neuronal Entanglement, and Sparsity.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 
26244–74."},"oa":1,"publisher":"ICLR","department":[{"_id":"DaAl"}],"arxiv":1,"publication_status":"published","has_accepted_license":"1","scopus_import":"1","conference":{"name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore","end_date":"2025-04-28","start_date":"2025-04-24"},"oa_version":"Published Version","day":"01","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"type":"conference","_id":"20037","date_updated":"2025-08-04T08:16:43Z","month":"04","title":"Wasserstein distances, neuronal entanglement, and sparsity","ddc":["000"],"corr_author":"1","status":"public","OA_place":"publisher","date_published":"2025-04-01T00:00:00Z","date_created":"2025-07-20T22:02:03Z","article_processing_charge":"No","external_id":{"arxiv":["2405.15756"]},"related_material":{"link":[{"relation":"software","url":"https://github.com/Shavit-Lab/Sparse-Expansion"}]},"OA_type":"diamond","page":"26244-26274","abstract":[{"text":"Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled \"Wasserstein Neurons\" in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. 
To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons.","lang":"eng"}],"file":[{"file_size":5447177,"date_updated":"2025-08-04T08:14:09Z","file_id":"20110","access_level":"open_access","creator":"dernst","success":1,"content_type":"application/pdf","file_name":"2025_ICLR_Sawmya.pdf","checksum":"39a8fa7dbdd7029859e156f53f20f6bc","date_created":"2025-08-04T08:14:09Z","relation":"main_file"}],"author":[{"last_name":"Sawmya","full_name":"Sawmya, Shashata","first_name":"Shashata"},{"first_name":"Linghao","full_name":"Kong, Linghao","last_name":"Kong"},{"first_name":"Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","full_name":"Markov, Ilia","last_name":"Markov"},{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"},{"full_name":"Shavit, Nir","first_name":"Nir","last_name":"Shavit"}],"quality_controlled":"1","publication":"13th International Conference on Learning Representations","year":"2025","language":[{"iso":"eng"}],"publication_identifier":{"isbn":["9798331320850"]},"file_date_updated":"2025-08-04T08:14:09Z","acknowledgement":"The authors would like to extend their gratitude to Lori Leu for her insightful comments on the\r\napplication of the Wasserstein distance metric. We also wish to thank Elias Frantar for his help in\r\nworking with the SparseGPT implementation and his advice for the project. 
Additionally, we would like to thank Tony Tong Wang and Thomas Athey for their valuable feedback and constructive discussions.\r\nThis work was supported by an NIH Brains CONNECTS U01 grant and AMD’s AI & HPC Fund."},{"type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"title":"The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws","ddc":["000"],"month":"04","_id":"20038","date_updated":"2025-08-04T08:24:59Z","department":[{"_id":"DaAl"}],"arxiv":1,"publication_status":"published","publisher":"ICLR","citation":{"chicago":"Jin, Tian, Ahmed Imtiaz Humayun, Utku Evci, Suvinay Subramanian, Amir Yazdanbakhsh, Dan-Adrian Alistarh, and Gintare Karolina Dziugaite. “The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” In <i>13th International Conference on Learning Representations</i>, 85165–81. ICLR, 2025.","short":"T. Jin, A.I. Humayun, U. Evci, S. Subramanian, A. Yazdanbakhsh, D.-A. Alistarh, G.K. Dziugaite, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 85165–85181.","ama":"Jin T, Humayun AI, Evci U, et al. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:85165-85181.","mla":"Jin, Tian, et al. “The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 85165–81.","ieee":"T. 
Jin <i>et al.</i>, “The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 85165–85181.","apa":"Jin, T., Humayun, A. I., Evci, U., Subramanian, S., Yazdanbakhsh, A., Alistarh, D.-A., &#38; Dziugaite, G. K. (2025). The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In <i>13th International Conference on Learning Representations</i> (pp. 85165–85181). Singapore, Singapore: ICLR.","ista":"Jin T, Humayun AI, Evci U, Subramanian S, Yazdanbakhsh A, Alistarh D-A, Dziugaite GK. 2025. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 85165–85181."},"oa":1,"day":"01","oa_version":"Published Version","conference":{"end_date":"2025-04-28","start_date":"2025-04-24","location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations"},"has_accepted_license":"1","scopus_import":"1","quality_controlled":"1","author":[{"last_name":"Jin","first_name":"Tian","full_name":"Jin, Tian"},{"last_name":"Humayun","first_name":"Ahmed Imtiaz","full_name":"Humayun, Ahmed Imtiaz"},{"full_name":"Evci, Utku","first_name":"Utku","last_name":"Evci"},{"full_name":"Subramanian, Suvinay","first_name":"Suvinay","last_name":"Subramanian"},{"last_name":"Yazdanbakhsh","first_name":"Amir","full_name":"Yazdanbakhsh, Amir"},{"orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","last_name":"Alistarh"},{"last_name":"Dziugaite","first_name":"Gintare Karolina","full_name":"Dziugaite, Gintare Karolina"}],"file_date_updated":"2025-08-04T08:23:47Z","acknowledgement":"We are deeply grateful to Elias Frantar, Naveen Kumar, Sanjiv Kumar, 
Daniel\r\nM. Roy, and Clemens Schaefer for their valuable feedback and thoughtful review of this paper.\r\nWe also acknowledge the critical support provided by the Google CoreML Performance Team, and Google Research during this project. We further recognize the extended team at Google DeepMind, who enabled and supported this research direction.\r\nThis work was in part supported by the Sloan Foundation, the MIT-IBM Watson AI Lab, Apple, and SRC JUMP 2.0 (CoCoSys).","publication_identifier":{"isbn":["9798331320850"]},"language":[{"iso":"eng"}],"year":"2025","publication":"13th International Conference on Learning Representations","external_id":{"arxiv":["2501.12486"]},"date_created":"2025-07-20T22:02:03Z","date_published":"2025-04-01T00:00:00Z","article_processing_charge":"No","status":"public","OA_place":"publisher","file":[{"success":1,"access_level":"open_access","creator":"dernst","content_type":"application/pdf","date_updated":"2025-08-04T08:23:47Z","file_size":704989,"file_id":"20111","relation":"main_file","checksum":"dbc27120e9aba67dffbd9e5d513a6803","file_name":"2025_ICLR_Jin.pdf","date_created":"2025-08-04T08:23:47Z"}],"OA_type":"diamond","page":"85165-85181","abstract":[{"text":"Pruning eliminates unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training--which combines pruning and pre-training into a single phase--provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25% of total training compute and concluding at 75% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. 
Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference.","lang":"eng"}]},{"ddc":["000"],"title":"In the search of optimal tree networks: Hardness and heuristics","date_updated":"2025-12-01T12:35:24Z","_id":"20224","month":"07","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"type":"conference","oa_version":"Published Version","day":"13","doi":"10.1145/3712256.3726425","has_accepted_license":"1","scopus_import":"1","conference":{"end_date":"2025-07-18","start_date":"2025-07-14","location":"Malaga, Spain","name":"GECCO: Genetic and evolutionary computation conference"},"publication_status":"published","department":[{"_id":"DaAl"}],"oa":1,"citation":{"ista":"Martynov P, Buzdalov M, Pankratov S, Aksenov V, Schmid S. 2025. In the search of optimal tree networks: Hardness and heuristics. Proceedings of the 2025 Genetic and Evolutionary Computation Conference. GECCO: Genetic and evolutionary computation conference, 249–257.","ieee":"P. Martynov, M. Buzdalov, S. Pankratov, V. Aksenov, and S. Schmid, “In the search of optimal tree networks: Hardness and heuristics,” in <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, Malaga, Spain, 2025, pp. 
249–257.","apa":"Martynov, P., Buzdalov, M., Pankratov, S., Aksenov, V., &#38; Schmid, S. (2025). In the search of optimal tree networks: Hardness and heuristics. In <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i> (pp. 249–257). Malaga, Spain: Association for Computing Machinery. <a href=\"https://doi.org/10.1145/3712256.3726425\">https://doi.org/10.1145/3712256.3726425</a>","mla":"Martynov, Pavel, et al. “In the Search of Optimal Tree Networks: Hardness and Heuristics.” <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, Association for Computing Machinery, 2025, pp. 249–57, doi:<a href=\"https://doi.org/10.1145/3712256.3726425\">10.1145/3712256.3726425</a>.","chicago":"Martynov, Pavel, Maxim Buzdalov, Sergei Pankratov, Vitaliy Aksenov, and Stefan Schmid. “In the Search of Optimal Tree Networks: Hardness and Heuristics.” In <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, 249–57. Association for Computing Machinery, 2025. <a href=\"https://doi.org/10.1145/3712256.3726425\">https://doi.org/10.1145/3712256.3726425</a>.","ama":"Martynov P, Buzdalov M, Pankratov S, Aksenov V, Schmid S. In the search of optimal tree networks: Hardness and heuristics. In: <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>. Association for Computing Machinery; 2025:249-257. doi:<a href=\"https://doi.org/10.1145/3712256.3726425\">10.1145/3712256.3726425</a>","short":"P. Martynov, M. Buzdalov, S. Pankratov, V. Aksenov, S. Schmid, in:, Proceedings of the 2025 Genetic and Evolutionary Computation Conference, Association for Computing Machinery, 2025, pp. 
249–257."},"publisher":"Association for Computing Machinery","file_date_updated":"2025-09-02T07:41:13Z","acknowledgement":"Research was supported by the German Research Foundation (DFG), grant 470029389 (FlexNets).","year":"2025","isi":1,"language":[{"iso":"eng"}],"publication":"Proceedings of the 2025 Genetic and Evolutionary Computation Conference","publication_identifier":{"isbn":["9798400714658"]},"author":[{"last_name":"Martynov","first_name":"Pavel","full_name":"Martynov, Pavel"},{"first_name":"Maxim","full_name":"Buzdalov, Maxim","last_name":"Buzdalov"},{"first_name":"Sergei","id":"f773bf05-72ef-11ef-b75a-a383d22f454b","full_name":"Pankratov, Sergei","last_name":"Pankratov"},{"first_name":"Vitaliy","full_name":"Aksenov, Vitaliy","last_name":"Aksenov"},{"last_name":"Schmid","first_name":"Stefan","full_name":"Schmid, Stefan"}],"quality_controlled":"1","abstract":[{"text":"Traffic in datacenters may follow some pattern: some pairs of servers communicate more frequently than others. Demand-oblivious networks may perform poorly for such workloads, and demand-aware networks optimized for traffic should be used instead. Unfortunately, not all shapes of networks are feasible in real hardware. Practical limitations are usually provided in the form of a topology. For example, a network may be required to be a binary tree, a bounded-degree graph or a Fat tree.\r\nIn this work, we consider a topology of a binary tree, one of the most fundamental network topologies. We show that already finding an optimal demand-aware binary tree network is NP-hard. 
Then, we explore how various optimization techniques, including simple local searches, as well as deterministic mutation and crossover operators, cope with generating efficient tree networks on real-life and synthetic workloads.","lang":"eng"}],"OA_type":"hybrid","page":"249-257","file":[{"file_id":"20273","date_updated":"2025-09-02T07:41:13Z","file_size":608996,"content_type":"application/pdf","success":1,"access_level":"open_access","creator":"dernst","date_created":"2025-09-02T07:41:13Z","file_name":"2025_GECCO_Martynov.pdf","checksum":"7e513fa508cff7e8a0d33f50b1fe09af","relation":"main_file"}],"external_id":{"isi":["001556459900031"]},"status":"public","OA_place":"publisher","article_processing_charge":"Yes (in subscription journal)","date_published":"2025-07-13T00:00:00Z","date_created":"2025-08-24T22:01:31Z"},{"title":"“Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization","ddc":["000"],"corr_author":"1","date_updated":"2025-11-26T11:15:11Z","_id":"20684","month":"08","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","oa_version":"Published Version","day":"01","scopus_import":"1","has_accepted_license":"1","conference":{"start_date":"2025-07-27","end_date":"2025-08-01","name":"ACL: Meeting of the Association for Computational Linguistics","location":"Vienna, Austria"},"publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"oa":1,"citation":{"short":"E. Kurtic, A. Marques, S. Pandit, M. Kurtz, D.-A. Alistarh, in:, Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, 2025, pp. 26872–26886.","ama":"Kurtic E, Marques A, Pandit S, Kurtz M, Alistarh D-A. “Give me BF16 or give me death”? 
Accuracy-performance trade-offs in LLM quantization. In: <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>. Association for Computational Linguistics; 2025:26872-26886.","chicago":"Kurtic, Eldar, Alexandre Marques, Shubhra Pandit, Mark Kurtz, and Dan-Adrian Alistarh. “‘Give Me BF16 or Give Me Death’? Accuracy-Performance Trade-Offs in LLM Quantization.” In <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, 26872–86. Association for Computational Linguistics, 2025.","mla":"Kurtic, Eldar, et al. “‘Give Me BF16 or Give Me Death’? Accuracy-Performance Trade-Offs in LLM Quantization.” <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, Association for Computational Linguistics, 2025, pp. 26872–86.","ieee":"E. Kurtic, A. Marques, S. Pandit, M. Kurtz, and D.-A. Alistarh, “‘Give me BF16 or give me death’? Accuracy-performance trade-offs in LLM quantization,” in <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, Vienna, Austria, 2025, pp. 26872–26886.","apa":"Kurtic, E., Marques, A., Pandit, S., Kurtz, M., &#38; Alistarh, D.-A. (2025). “Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization. In <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i> (pp. 26872–26886). Vienna, Austria: Association for Computational Linguistics.","ista":"Kurtic E, Marques A, Pandit S, Kurtz M, Alistarh D-A. 2025. “Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization. Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics. 
ACL: Meeting of the Association for Computational Linguistics, 26872–26886."},"publisher":"Association for Computational Linguistics","file_date_updated":"2025-11-26T11:06:57Z","year":"2025","language":[{"iso":"eng"}],"publication":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics","publication_identifier":{"isbn":["9798891762510"],"issn":["0736-587X"]},"author":[{"full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","first_name":"Eldar","last_name":"Kurtic"},{"last_name":"Marques","full_name":"Marques, Alexandre","first_name":"Alexandre"},{"last_name":"Pandit","full_name":"Pandit, Shubhra","first_name":"Shubhra"},{"last_name":"Kurtz","full_name":"Kurtz, Mark","first_name":"Mark"},{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"quality_controlled":"1","abstract":[{"text":"Quantization is a powerful tool for accelerating large language model (LLM) inference, but the accuracy-performance trade-offs across different formats remain unclear. In this paper, we conduct the most comprehensive empirical study to date, evaluating FP8, INT8, and INT4\r\nquantization across academic benchmarks and real-world tasks on the entire Llama-3.1 model\r\nfamily. Through over 500,000 evaluations, our investigation yields several key findings: (1) FP8 (W8A8-FP) is effectively lossless across all model scales, (2) well-tuned INT8 (W8A8-INT) achieves surprisingly low (1-3%) accuracy degradation, and (3) INT4 weight-only (W4A16-INT) is more competitive than expected, rivaling 8-bit quantization. Further, we investigate the optimal quantization format for different deployments by analyzing inference performance through the popular vLLM framework. Our analysis provides clear deployment recommendations: W4A16 is the most cost-efficient for synchronous setups, while W8A8 dominates in asynchronous\r\ncontinuous batching. 
For mixed workloads, the optimal choice depends on the specific use\r\ncase. Our findings offer practical, data-driven guidelines for deploying quantized LLMs at scale—ensuring the best balance between speed, efficiency, and accuracy. ","lang":"eng"}],"OA_type":"gold","page":"26872-26886","file":[{"relation":"main_file","file_name":"2025_ACL_Kurtic.pdf","checksum":"4c066ee20f9ab17619c95652c0eb75f1","date_created":"2025-11-26T11:06:57Z","success":1,"creator":"dernst","access_level":"open_access","content_type":"application/pdf","date_updated":"2025-11-26T11:06:57Z","file_size":417450,"file_id":"20698"}],"external_id":{"arxiv":["2411.02355"]},"OA_place":"publisher","status":"public","article_processing_charge":"No","date_created":"2025-11-24T14:20:46Z","date_published":"2025-08-01T00:00:00Z"},{"ec_funded":1,"type":"journal_article","pmid":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","corr_author":"1","title":"Scalable multitemperature free energy sampling of classical Ising spin states","month":"10","_id":"20704","date_updated":"2025-12-01T15:40:27Z","intvolume":"        21","department":[{"_id":"BiCh"},{"_id":"DaAl"}],"publication_status":"published","publisher":"American Chemical Society","citation":{"ista":"Tuo P, Zeng Z, Chen J, Cheng B. 2025. Scalable multitemperature free energy sampling of classical Ising spin states. Journal of Chemical Theory and Computation. 21(22), 11427–11435.","apa":"Tuo, P., Zeng, Z., Chen, J., &#38; Cheng, B. (2025). Scalable multitemperature free energy sampling of classical Ising spin states. <i>Journal of Chemical Theory and Computation</i>. American Chemical Society. <a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">https://doi.org/10.1021/acs.jctc.5c01248</a>","ieee":"P. Tuo, Z. Zeng, J. Chen, and B. Cheng, “Scalable multitemperature free energy sampling of classical Ising spin states,” <i>Journal of Chemical Theory and Computation</i>, vol. 21, no. 22. American Chemical Society, pp. 
11427–11435, 2025.","mla":"Tuo, Ping, et al. “Scalable Multitemperature Free Energy Sampling of Classical Ising Spin States.” <i>Journal of Chemical Theory and Computation</i>, vol. 21, no. 22, American Chemical Society, 2025, pp. 11427–35, doi:<a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">10.1021/acs.jctc.5c01248</a>.","chicago":"Tuo, Ping, Zezhu Zeng, Jiale Chen, and Bingqing Cheng. “Scalable Multitemperature Free Energy Sampling of Classical Ising Spin States.” <i>Journal of Chemical Theory and Computation</i>. American Chemical Society, 2025. <a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">https://doi.org/10.1021/acs.jctc.5c01248</a>.","short":"P. Tuo, Z. Zeng, J. Chen, B. Cheng, Journal of Chemical Theory and Computation 21 (2025) 11427–11435.","ama":"Tuo P, Zeng Z, Chen J, Cheng B. Scalable multitemperature free energy sampling of classical Ising spin states. <i>Journal of Chemical Theory and Computation</i>. 2025;21(22):11427-11435. doi:<a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">10.1021/acs.jctc.5c01248</a>"},"doi":"10.1021/acs.jctc.5c01248","oa_version":"None","day":"31","project":[{"_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","call_identifier":"H2020","name":"IST-BRIDGE: International postdoctoral program","grant_number":"101034413"}],"volume":21,"scopus_import":"1","acknowledged_ssus":[{"_id":"ScienComp"}],"quality_controlled":"1","issue":"22","author":[{"full_name":"Tuo, Ping","first_name":"Ping","id":"6e5644c0-c180-11ed-a2da-facc4c9f4f09","last_name":"Tuo"},{"last_name":"Zeng","id":"54a2c730-803f-11ed-ab7e-95b29d2680e7","first_name":"Zezhu","full_name":"Zeng, Zezhu","orcid":"0000-0001-5126-4928"},{"full_name":"Chen, Jiale","orcid":"0000-0001-5337-5875","first_name":"Jiale","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","last_name":"Chen"},{"last_name":"Cheng","first_name":"Bingqing","id":"cbe3cda4-d82c-11eb-8dc7-8ff94289fcc9","orcid":"0000-0002-3584-9632","full_name":"Cheng, Bingqing"}],"acknowledgement":"P.T. 
acknowledges funding from FFG MAGNIFICO and the BIDMaP Postdoctoral Fellowship. Z.Z. acknowledges funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No. 101034413. The authors acknowledge the research computing facilities provided by the Institute of Science and Technology Austria (ISTA), and resources of the National Energy Research Scientific Computing Center (NERSC), a Department of Energy Office of Science User Facility using NERSC award DOEERCAP0031751 ’GenAI@NERSC’. P.T. acknowledges valued discussions with Dr. Daniel King, Dr. Lei Wang, and Dr. Fuzhi Dai.","publication_identifier":{"eissn":["1549-9626"],"issn":["1549-9618"]},"publication":"Journal of Chemical Theory and Computation","language":[{"iso":"eng"}],"year":"2025","isi":1,"related_material":{"link":[{"url":"https://github.com/tuoping/alchemicalFES","relation":"software"}]},"article_type":"original","external_id":{"pmid":["41172130"],"isi":["001605927900001"]},"date_created":"2025-11-30T23:02:06Z","date_published":"2025-10-31T00:00:00Z","article_processing_charge":"No","status":"public","page":"11427-11435","OA_type":"closed access","abstract":[{"lang":"eng","text":"Generative models have advanced significantly in sampling material systems with continuous variables, such as atomistic structures. However, their application to discrete variables, like atom types or spin states, remains underexplored. In this work, we introduce a discrete flow matching model, tailored for systems with discrete phase-space coordinates (e.g., the Ising model or a multicomponent system on a lattice). This approach enables a single model to sample free energy surfaces over a wide temperature range with minimal training overhead, and the model generation is scalable to larger lattice sizes than those in the training set. We demonstrate our approach on the 2D Ising model, showing efficient and reliable free energy sampling. 
These results highlight the potential of flow matching for low-cost, scalable free energy sampling in discrete systems and suggest promising extensions to alchemical degrees of freedom in crystalline materials. The codebase developed for this work is openly available at https://github.com/tuoping/alchemicalFES."}]},{"publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"}],"intvolume":"        39","oa":1,"citation":{"short":"S. Talaei, M. Ansaripour, G. Nadiradze, D.-A. Alistarh, Proceedings of the 39th AAAI Conference on Artificial Intelligence 39 (2025) 20778–20786.","ama":"Talaei S, Ansaripour M, Nadiradze G, Alistarh D-A. Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. 2025;39(19):20778-20786. doi:<a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">10.1609/aaai.v39i19.34290</a>","chicago":"Talaei, Shayan, Matin Ansaripour, Giorgi Nadiradze, and Dan-Adrian Alistarh. “Hybrid Decentralized Optimization: Leveraging Both First- and Zeroth-Order Optimizers for Faster Convergence.” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. Association for the Advancement of Artificial Intelligence, 2025. <a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">https://doi.org/10.1609/aaai.v39i19.34290</a>.","mla":"Talaei, Shayan, et al. “Hybrid Decentralized Optimization: Leveraging Both First- and Zeroth-Order Optimizers for Faster Convergence.” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>, vol. 39, no. 19, Association for the Advancement of Artificial Intelligence, 2025, pp. 20778–86, doi:<a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">10.1609/aaai.v39i19.34290</a>.","apa":"Talaei, S., Ansaripour, M., Nadiradze, G., &#38; Alistarh, D.-A. (2025). Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. 
<i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. Association for the Advancement of Artificial Intelligence. <a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">https://doi.org/10.1609/aaai.v39i19.34290</a>","ieee":"S. Talaei, M. Ansaripour, G. Nadiradze, and D.-A. Alistarh, “Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence,” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>, vol. 39, no. 19. Association for the Advancement of Artificial Intelligence, pp. 20778–20786, 2025.","ista":"Talaei S, Ansaripour M, Nadiradze G, Alistarh D-A. 2025. Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. Proceedings of the 39th AAAI Conference on Artificial Intelligence. 39(19), 20778–20786."},"publisher":"Association for the Advancement of Artificial Intelligence","oa_version":"Preprint","day":"11","main_file_link":[{"url":"https://doi.org/10.1609/aaai.v39i19.34290","open_access":"1"}],"doi":"10.1609/aaai.v39i19.34290","scopus_import":"1","volume":39,"project":[{"name":"Elastic Coordination for Scalable Machine Learning","grant_number":"805223","call_identifier":"H2020","_id":"268A44D6-B435-11E9-9278-68D0E5697425"}],"ec_funded":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"journal_article","title":"Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence","corr_author":"1","date_updated":"2026-02-16T12:34:44Z","_id":"19713","month":"04","external_id":{"arxiv":["2210.07703"]},"article_type":"original","related_material":{"link":[{"url":"https://github.com/ShayanTalaei/HDO","relation":"software"}]},"OA_place":"publisher","status":"public","article_processing_charge":"No","date_created":"2025-05-19T14:15:35Z","date_published":"2025-04-11T00:00:00Z","abstract":[{"lang":"eng","text":"Distributed optimization is the standard way of speeding up machine 
learning training, and most of the research in the area focuses on distributed first-order, gradient-based methods. Yet, there are settings where some computationally-bounded nodes may not be able to implement first-order, gradient-based optimization, while they could still contribute to joint optimization tasks. In this paper, we initiate the study of hybrid decentralized optimization, studying settings where nodes with zeroth-order and first-order optimization capabilities co-exist in a distributed system, and attempt to jointly solve an optimization task over some data distribution. We essentially show that, under reasonable parameter settings, such a system can not only withstand noisier zeroth-order agents but can even benefit from integrating such agents into the optimization process, rather than ignoring their information. At the core of our approach is a new analysis of distributed optimization with noisy and possibly-biased gradient estimators, which may be of independent interest. Our results hold for both convex and non-convex objectives. 
Experimental results on standard optimization tasks confirm our analysis, showing that hybrid first-zeroth order optimization can be practical, even when training deep neural networks."}],"page":"20778-20786","OA_type":"free access","author":[{"last_name":"Talaei","full_name":"Talaei, Shayan","first_name":"Shayan"},{"last_name":"Ansaripour","full_name":"Ansaripour, Matin","first_name":"Matin"},{"orcid":"0000-0001-5634-0731","full_name":"Nadiradze, Giorgi","first_name":"Giorgi","id":"3279A00C-F248-11E8-B48F-1D18A9856A87","last_name":"Nadiradze"},{"last_name":"Alistarh","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian"}],"issue":"19","quality_controlled":"1","acknowledgement":"This project has received funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement\r\nNo 805223 ScaleML). The authors would like to acknowledge Eugenia Iofinova for useful discussions during the inception of this project.","year":"2025","publication":"Proceedings of the 39th AAAI Conference on Artificial Intelligence","language":[{"iso":"eng"}],"publication_identifier":{"issn":["2159-5399"],"eissn":["2374-3468"]}},{"file_date_updated":"2025-12-16T12:32:40Z","publication_identifier":{"eissn":["2640-3498"]},"year":"2025","publication":"42nd International Conference on Machine Learning","language":[{"iso":"eng"}],"quality_controlled":"1","author":[{"last_name":"Sieberling","first_name":"Oliver","full_name":"Sieberling, Oliver"},{"full_name":"Kuznedelev, Denis","first_name":"Denis","last_name":"Kuznedelev"},{"first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","full_name":"Kurtic, Eldar","last_name":"Kurtic"},{"last_name":"Alistarh","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, 
Dan-Adrian","orcid":"0000-0003-3650-940X"}],"file":[{"file_name":"2025_ICML_Sieberling.pdf","checksum":"1d744fbaeb199b08e8b6f48bc0dd047e","date_created":"2025-12-16T12:32:40Z","relation":"main_file","date_updated":"2025-12-16T12:32:40Z","file_size":908379,"file_id":"20828","success":1,"creator":"dernst","access_level":"open_access","content_type":"application/pdf"}],"OA_type":"gold","page":"55556-55590","abstract":[{"text":"The high computational costs of large language models (LLMs) have led to a flurry of research on LLM compression, via methods such as quantization, sparsification, or structured pruning. A new frontier in this area is given by dynamic, non-uniform compression methods, which adjust the compression levels (e.g., sparsity) per-block or even per-layer in order to minimize accuracy loss, while guaranteeing a global compression threshold. Yet, current methods rely on estimating the \"importance\" of a given layer, implicitly assuming that layers contribute independently to the overall compression error. We begin from the motivating observation that this independence assumption does not generally hold for LLM compression: pruning a model further may even significantly recover performance. To address this, we propose EvoPress, a novel evolutionary framework for dynamic LLM compression. By formulating dynamic compression as a general optimization problem, EvoPress identifies optimal compression profiles in a highly efficient manner, and generalizes across diverse models and compression techniques. 
Via EvoPress, we achieve state-of-the-art performance for dynamic compression of Llama, Mistral, and Phi models, setting new benchmarks for structural pruning (block/layer dropping), unstructured sparsity, and quantization with dynamic bitwidths.","lang":"eng"}],"external_id":{"arxiv":["2410.14649"]},"date_created":"2025-12-14T23:02:05Z","date_published":"2025-05-01T00:00:00Z","article_processing_charge":"No","status":"public","OA_place":"publisher","corr_author":"1","alternative_title":["PMLR"],"ddc":["000"],"title":"EvoPress: Accurate dynamic model compression via evolutionary search","month":"05","_id":"20820","date_updated":"2025-12-16T12:34:32Z","type":"conference","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","day":"01","oa_version":"Published Version","conference":{"end_date":"2025-07-19","start_date":"2025-07-13","name":"ICML: International Conference on Machine Learning","location":"Vancouver, Canada"},"volume":267,"scopus_import":"1","has_accepted_license":"1","intvolume":"       267","department":[{"_id":"DaAl"}],"arxiv":1,"publication_status":"published","publisher":"ML Research Press","citation":{"ieee":"O. Sieberling, D. Kuznedelev, E. Kurtic, and D.-A. Alistarh, “EvoPress: Accurate dynamic model compression via evolutionary search,” in <i>42nd International Conference on Machine Learning</i>, Vancouver, Canada, 2025, vol. 267, pp. 55556–55590.","ista":"Sieberling O, Kuznedelev D, Kurtic E, Alistarh D-A. 2025. EvoPress: Accurate dynamic model compression via evolutionary search. 42nd International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 267, 55556–55590.","apa":"Sieberling, O., Kuznedelev, D., Kurtic, E., &#38; Alistarh, D.-A. (2025). 
EvoPress: Accurate dynamic model compression via evolutionary search. In <i>42nd International Conference on Machine Learning</i> (Vol. 267, pp. 55556–55590). Vancouver, Canada: ML Research Press.","mla":"Sieberling, Oliver, et al. “EvoPress: Accurate Dynamic Model Compression via Evolutionary Search.” <i>42nd International Conference on Machine Learning</i>, vol. 267, ML Research Press, 2025, pp. 55556–90.","ama":"Sieberling O, Kuznedelev D, Kurtic E, Alistarh D-A. EvoPress: Accurate dynamic model compression via evolutionary search. In: <i>42nd International Conference on Machine Learning</i>. Vol 267. ML Research Press; 2025:55556-55590.","short":"O. Sieberling, D. Kuznedelev, E. Kurtic, D.-A. Alistarh, in:, 42nd International Conference on Machine Learning, ML Research Press, 2025, pp. 55556–55590.","chicago":"Sieberling, Oliver, Denis Kuznedelev, Eldar Kurtic, and Dan-Adrian Alistarh. “EvoPress: Accurate Dynamic Model Compression via Evolutionary Search.” In <i>42nd International Conference on Machine Learning</i>, 267:55556–90. ML Research Press, 2025."},"oa":1},{"external_id":{"arxiv":["2505.14371"]},"status":"public","OA_place":"publisher","article_processing_charge":"No","date_published":"2025-05-01T00:00:00Z","date_created":"2025-12-14T23:02:06Z","abstract":[{"text":"Modern deep neural networks exhibit heterogeneity across numerous layers of various types such as residuals, multi-head attention, etc., due to varying structures (dimensions, activation functions, etc.), distinct representation characteristics, which impact predictions. We develop a general layer-wise quantization framework with tight variance and code-length bounds, adapting to the heterogeneities over the course of training. We then apply a new layer-wise quantization technique within distributed variational inequalities (VIs), proposing a novel Quantized Optimistic Dual Averaging (QODA) algorithm with adaptive learning rates, which achieves competitive convergence rates for monotone VIs. 
We empirically show that QODA achieves up to a 150% speedup over the baselines in end-to-end training time for training Wasserstein GAN on 12+GPUs.","lang":"eng"}],"OA_type":"gold","page":"46026-46072","file":[{"content_type":"application/pdf","creator":"dernst","access_level":"open_access","success":1,"file_id":"20830","file_size":756213,"date_updated":"2025-12-16T12:45:41Z","relation":"main_file","date_created":"2025-12-16T12:45:41Z","file_name":"2025_ICML_Nguyen.pdf","checksum":"a7edf0e4304171a3e035842b3aab1704"}],"author":[{"last_name":"Nguyen","first_name":"Anh Duc","full_name":"Nguyen, Anh Duc"},{"last_name":"Markov","full_name":"Markov, Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","first_name":"Ilia"},{"last_name":"Wu","full_name":"Wu, Frank Zhengqing","first_name":"Frank Zhengqing"},{"full_name":"Ramezani-Kebrya, Ali","first_name":"Ali","last_name":"Ramezani-Kebrya"},{"last_name":"Antonakopoulos","full_name":"Antonakopoulos, Kimon","first_name":"Kimon"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"},{"last_name":"Cevher","full_name":"Cevher, Volkan","first_name":"Volkan"}],"quality_controlled":"1","file_date_updated":"2025-12-16T12:45:41Z","acknowledgement":"This work was supported by Hasler Foundation Program: Hasler Responsible AI (project number 21043). The research was also sponsored by the Army Research Office and was accomplished under Grant Number W911NF-24-1-0048. This work was further funded by the Swiss National Science Foundation (SNSF) under grant number 200021_205011. We also acknowledge project A11 of the Swiss National Supercomputing Centre (CSCS) for providing computing resources. Dan Alistarh and Ilia Markov were supported in part through the ERC Proofof-Concept grant FastML (Grant Agreement 101158077). 
Ali Ramezani-Kebrya was supported by the Research Council of Norway through FRIPRO Grant under project number 356103, its Centres of Excellence scheme, Integreat - Norwegian Centre for knowledge-driven machine learning under\r\nproject number 332645 - and its Centre for Research-based Innovation funding scheme (Visual Intelligence under grant no. 309439).","language":[{"iso":"eng"}],"year":"2025","publication":"42nd International Conference on Machine Learning","publication_identifier":{"eissn":["2640-3498"]},"arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"}],"intvolume":"       267","oa":1,"citation":{"apa":"Nguyen, A. D., Markov, I., Wu, F. Z., Ramezani-Kebrya, A., Antonakopoulos, K., Alistarh, D.-A., &#38; Cevher, V. (2025). Layer-wise quantization for quantized optimistic dual averaging. In <i>42nd International Conference on Machine Learning</i> (Vol. 267, pp. 46026–46072). Vancouver, Canada: ML Research Press.","ista":"Nguyen AD, Markov I, Wu FZ, Ramezani-Kebrya A, Antonakopoulos K, Alistarh D-A, Cevher V. 2025. Layer-wise quantization for quantized optimistic dual averaging. 42nd International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 267, 46026–46072.","ieee":"A. D. Nguyen <i>et al.</i>, “Layer-wise quantization for quantized optimistic dual averaging,” in <i>42nd International Conference on Machine Learning</i>, Vancouver, Canada, 2025, vol. 267, pp. 46026–46072.","mla":"Nguyen, Anh Duc, et al. “Layer-Wise Quantization for Quantized Optimistic Dual Averaging.” <i>42nd International Conference on Machine Learning</i>, vol. 267, ML Research Press, 2025, pp. 46026–72.","chicago":"Nguyen, Anh Duc, Ilia Markov, Frank Zhengqing Wu, Ali Ramezani-Kebrya, Kimon Antonakopoulos, Dan-Adrian Alistarh, and Volkan Cevher. “Layer-Wise Quantization for Quantized Optimistic Dual Averaging.” In <i>42nd International Conference on Machine Learning</i>, 267:46026–72. 
ML Research Press, 2025.","ama":"Nguyen AD, Markov I, Wu FZ, et al. Layer-wise quantization for quantized optimistic dual averaging. In: <i>42nd International Conference on Machine Learning</i>. Vol 267. ML Research Press; 2025:46026-46072.","short":"A.D. Nguyen, I. Markov, F.Z. Wu, A. Ramezani-Kebrya, K. Antonakopoulos, D.-A. Alistarh, V. Cevher, in:, 42nd International Conference on Machine Learning, ML Research Press, 2025, pp. 46026–46072."},"publisher":"ML Research Press","oa_version":"Published Version","day":"01","has_accepted_license":"1","volume":267,"scopus_import":"1","project":[{"_id":"8e35c14b-16d5-11f0-9cad-a3fc35339161","grant_number":"101158077","name":"FastML: Efficient and Cost-Effective Distributed Machine Learning"}],"conference":{"end_date":"2025-07-19","start_date":"2025-07-13","location":"Vancouver, Canada","name":"ICML: International Conference on Machine Learning"},"tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","title":"Layer-wise quantization for quantized optimistic dual averaging","ddc":["000"],"alternative_title":["PMLR"],"date_updated":"2025-12-16T12:46:54Z","_id":"20821","month":"05"},{"acknowledgement":"The work of Dan Alistarh is supported by grants from ERC, Austrian FWF, and the Google and NVIDIA corporations. 
Faith Ellen was supported in part by the Natural Science and Engineering Research Council of Canada (NSERC) grant RGPIN-2020-04178.","file_date_updated":"2026-02-18T06:46:02Z","year":"2025","language":[{"iso":"eng"}],"publication":"39th International Symposium on Distributed Computing","author":[{"first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh"},{"first_name":"Faith","full_name":"Ellen, Faith","last_name":"Ellen"},{"id":"2e711909-896a-11ed-bdf8-eb0f5a2984c6","first_name":"Alexander","full_name":"Fedorov, Alexander","last_name":"Fedorov"}],"quality_controlled":"1","abstract":[{"text":"We investigate the step complexity of the Leader Election problem (and implementing the corresponding test-and-set object) in asynchronous shared memory, where processes communicate through registers supporting atomic read and write and must coordinate so that a single process becomes the leader. Determining tight step complexity bounds for solving this problem is one of the key open problems in the theory of shared memory distributed computing. The best known algorithm is a randomized tournament-tree, which has worst-case expected step complexity O(log N) for N processes. There are provably no deterministic wait-free algorithms, and only restricted lower bounds are known for obstruction-free and randomized wait-free algorithms. We introduce a new lower bound that establishes an Ω((log N)/(log log N + log Q)) step complexity for any obstruction-free Leader Election algorithm, where N is the number of processes, and 2 ≤ Q ≤ N is a bound on the value contention, which we define as the maximum number of different values that processes can be simultaneously poised to write to the same register in any execution of the algorithm. Our result is strictly stronger than previous bounds based on write contention. 
In particular, it implies new lower bounds on step complexity that depend on register size.","lang":"eng"}],"page":"3:1-3:16","OA_type":"gold","file":[{"file_size":1492189,"date_updated":"2026-02-18T06:46:02Z","file_id":"21310","creator":"dernst","access_level":"open_access","success":1,"content_type":"application/pdf","file_name":"2025_LIPIcs_Alistarh.pdf","checksum":"3825a0e6e6a05503e842a59f95528bd9","date_created":"2026-02-18T06:46:02Z","relation":"main_file"}],"OA_place":"publisher","status":"public","article_processing_charge":"Yes","date_published":"2025-10-22T00:00:00Z","date_created":"2026-02-16T15:41:15Z","ddc":["000"],"title":"An almost-logarithmic lower bound for leader election with bounded value contention","alternative_title":["LIPIcs"],"corr_author":"1","date_updated":"2026-02-18T06:49:38Z","_id":"21250","month":"10","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)"},"type":"conference","day":"22","oa_version":"Published Version","doi":"10.4230/LIPIcs.DISC.2025.3","volume":356,"has_accepted_license":"1","conference":{"location":"Berlin, Germany","name":"DISC: Symposium on Distributed Computing","start_date":"2025-10-27","end_date":"2025-10-31"},"publication_status":"published","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"intvolume":"       356","oa":1,"citation":{"short":"D.-A. Alistarh, F. Ellen, A. Fedorov, in:, 39th International Symposium on Distributed Computing, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025, p. 3:1-3:16.","ama":"Alistarh D-A, Ellen F, Fedorov A. An almost-logarithmic lower bound for leader election with bounded value contention. In: <i>39th International Symposium on Distributed Computing</i>. Vol 356. Schloss Dagstuhl - Leibniz-Zentrum für Informatik; 2025:3:1-3:16. 
doi:<a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">10.4230/LIPIcs.DISC.2025.3</a>","chicago":"Alistarh, Dan-Adrian, Faith Ellen, and Alexander Fedorov. “An Almost-Logarithmic Lower Bound for Leader Election with Bounded Value Contention.” In <i>39th International Symposium on Distributed Computing</i>, 356:3:1-3:16. Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025. <a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">https://doi.org/10.4230/LIPIcs.DISC.2025.3</a>.","mla":"Alistarh, Dan-Adrian, et al. “An Almost-Logarithmic Lower Bound for Leader Election with Bounded Value Contention.” <i>39th International Symposium on Distributed Computing</i>, vol. 356, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025, p. 3:1-3:16, doi:<a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">10.4230/LIPIcs.DISC.2025.3</a>.","apa":"Alistarh, D.-A., Ellen, F., &#38; Fedorov, A. (2025). An almost-logarithmic lower bound for leader election with bounded value contention. In <i>39th International Symposium on Distributed Computing</i> (Vol. 356, p. 3:1-3:16). Berlin, Germany: Schloss Dagstuhl - Leibniz-Zentrum für Informatik. <a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">https://doi.org/10.4230/LIPIcs.DISC.2025.3</a>","ieee":"D.-A. Alistarh, F. Ellen, and A. Fedorov, “An almost-logarithmic lower bound for leader election with bounded value contention,” in <i>39th International Symposium on Distributed Computing</i>, Berlin, Germany, 2025, vol. 356, p. 3:1-3:16.","ista":"Alistarh D-A, Ellen F, Fedorov A. 2025. An almost-logarithmic lower bound for leader election with bounded value contention. 39th International Symposium on Distributed Computing. DISC: Symposium on Distributed Computing, LIPIcs, vol. 
356, 3:1-3:16."},"publisher":"Schloss Dagstuhl - Leibniz-Zentrum für Informatik"},{"type":"book_chapter","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","alternative_title":["Machine Translation: Technologies and Applications"],"corr_author":"1","title":"Sparse Fine-Tuning for Inference Acceleration of Large Language Models","month":"07","date_updated":"2026-02-19T09:26:54Z","_id":"21257","arxiv":1,"publication_status":"published","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"publisher":"Springer Nature","oa":1,"citation":{"mla":"Kurtic, Eldar, et al. “Sparse Fine-Tuning for Inference Acceleration of Large Language Models.” <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, edited by Peyman Passban et al., Springer Nature, 2025, pp. 83–97, doi:<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">10.1007/978-3-031-85747-8_6</a>.","ista":"Kurtic E, Kuznedelev D, Frantar E, Goinv M, Pandit S, Agarwalla A, Nguyen T, Marques A, Kurtz M, Alistarh D-A. 2025.Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In: Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques. Machine Translation: Technologies and Applications, , 83–97.","ieee":"E. Kurtic <i>et al.</i>, “Sparse Fine-Tuning for Inference Acceleration of Large Language Models,” in <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, P. Passban, A. Way, and M. Rezagholizadeh, Eds. Springer Nature, 2025, pp. 83–97.","apa":"Kurtic, E., Kuznedelev, D., Frantar, E., Goinv, M., Pandit, S., Agarwalla, A., … Alistarh, D.-A. (2025). Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In P. Passban, A. Way, &#38; M. Rezagholizadeh (Eds.), <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i> (pp. 83–97). Springer Nature. 
<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">https://doi.org/10.1007/978-3-031-85747-8_6</a>","chicago":"Kurtic, Eldar, Denis Kuznedelev, Elias Frantar, Michael Goinv, Shubhra Pandit, Abhinav Agarwalla, Tuan Nguyen, Alexandre Marques, Mark Kurtz, and Dan-Adrian Alistarh. “Sparse Fine-Tuning for Inference Acceleration of Large Language Models.” In <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, edited by Peyman Passban, Andy Way, and Mehdi Rezagholizadeh, 83–97. Springer Nature, 2025. <a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">https://doi.org/10.1007/978-3-031-85747-8_6</a>.","ama":"Kurtic E, Kuznedelev D, Frantar E, et al. Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In: Passban P, Way A, Rezagholizadeh M, eds. <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>. Springer Nature; 2025:83-97. doi:<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">10.1007/978-3-031-85747-8_6</a>","short":"E. Kurtic, D. Kuznedelev, E. Frantar, M. Goinv, S. Pandit, A. Agarwalla, T. Nguyen, A. Marques, M. Kurtz, D.-A. Alistarh, in:, P. Passban, A. Way, M. Rezagholizadeh (Eds.), Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques, Springer Nature, 2025, pp. 
83–97."},"day":"05","oa_version":"Preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2310.06927","open_access":"1"}],"doi":"10.1007/978-3-031-85747-8_6","quality_controlled":"1","author":[{"full_name":"Kurtic, Eldar","first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","last_name":"Kurtic"},{"last_name":"Kuznedelev","full_name":"Kuznedelev, Denis","first_name":"Denis"},{"last_name":"Frantar","first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias"},{"full_name":"Goinv, Michael","first_name":"Michael","last_name":"Goinv"},{"first_name":"Shubhra","full_name":"Pandit, Shubhra","last_name":"Pandit"},{"full_name":"Agarwalla, Abhinav","first_name":"Abhinav","last_name":"Agarwalla"},{"first_name":"Tuan","full_name":"Nguyen, Tuan","last_name":"Nguyen"},{"full_name":"Marques, Alexandre","first_name":"Alexandre","last_name":"Marques"},{"full_name":"Kurtz, Mark","first_name":"Mark","last_name":"Kurtz"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"acknowledgement":"We would like to thank Eugenia Iofinova for useful comments on an earlier version of this draft, and Artur Niederfahrenhorst for useful suggestions regarding fine-tuning on the GSM8k dataset.","publication_identifier":{"issn":["2522-8021"],"eisbn":["9783031857478"],"eissn":["2522-803X"],"isbn":["9783031857461"]},"year":"2025","language":[{"iso":"eng"}],"publication":"Enhancing LLM Performance. 
Efficacy, Fine-Tuning, and Inference Techniques","editor":[{"last_name":"Passban","full_name":"Passban, Peyman","first_name":"Peyman"},{"last_name":"Way","full_name":"Way, Andy","first_name":"Andy"},{"full_name":"Rezagholizadeh, Mehdi","first_name":"Mehdi","last_name":"Rezagholizadeh"}],"external_id":{"arxiv":["2310.06927"]},"article_processing_charge":"No","date_published":"2025-07-05T00:00:00Z","date_created":"2026-02-16T15:57:53Z","status":"public","OA_place":"repository","abstract":[{"text":"We investigate the problem of accurate sparse fine-tuning of large language models (LLMs), that is, fine-tuning pre-trained LLMs on specialized tasks, while inducing sparsity in their weights. Our work is motivated by experiments showing that standard loss-based fine-tuning methods are not able to achieve high accuracy in this setting, especially at high sparsity targets. To address this issue, we perform a detailed study of knowledge distillation losses for fine-tuning of sparse models. We determine an L2-based distillation approach that we term ‘SquareHead’, which enables accurate recovery even at higher sparsities. Investigating the question of efficient inference, we show that sparse LLMs can be executed faster by taking advantage of sparsity. Specifically, we exhibit end-to-end results showing speedups enabled by sparsity, while recovering accuracy, on the following models and tasks, respectively: T5 for language translation, Whisper for speech translation, and open GPT-type models such as the Mosaic Pre-Trained Transformer (MPT) and Llama-2 models for text generation. In particular, for popular generative tasks, we show for the first time that sparse fine-tuning can reach 75% sparsity without drops in accuracy, and provide notable end-to-end speedups for inference on CPUs. 
Moreover, we also highlight that sparsity is compatible with other compression approaches, such as quantization.","lang":"eng"}],"OA_type":"green","page":"83-97"},{"type":"preprint","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","month":"06","_id":"21858","date_updated":"2026-05-19T11:20:27Z","corr_author":"1","title":"Position: It's time to act on the risk of efficient personalized text generation","citation":{"chicago":"Iofinova, Eugenia B, Andrej Jovanovic, and Dan-Adrian Alistarh. “Position: It’s Time to Act on the Risk of Efficient Personalized Text Generation.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">https://doi.org/10.48550/arXiv.2502.06560</a>.","ama":"Iofinova EB, Jovanovic A, Alistarh D-A. Position: It’s time to act on the risk of efficient personalized text generation. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>","short":"E.B. Iofinova, A. Jovanovic, D.-A. Alistarh, ArXiv (n.d.).","ista":"Iofinova EB, Jovanovic A, Alistarh D-A. Position: It’s time to act on the risk of efficient personalized text generation. arXiv, <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>.","apa":"Iofinova, E. B., Jovanovic, A., &#38; Alistarh, D.-A. (n.d.). Position: It’s time to act on the risk of efficient personalized text generation. <i>arXiv</i>. <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">https://doi.org/10.48550/arXiv.2502.06560</a>","ieee":"E. B. Iofinova, A. Jovanovic, and D.-A. Alistarh, “Position: It’s time to act on the risk of efficient personalized text generation,” <i>arXiv</i>. .","mla":"Iofinova, Eugenia B., et al. 
“Position: It’s Time to Act on the Risk of Efficient Personalized Text Generation.” <i>ArXiv</i>, doi:<a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>."},"oa":1,"department":[{"_id":"GradSch"},{"_id":"DaAl"}],"publication_status":"draft","arxiv":1,"project":[{"name":"FastML: Efficient and Cost-Effective Distributed Machine Learning","grant_number":"101158077","_id":"8e35c14b-16d5-11f0-9cad-a3fc35339161"},{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","name":"Vienna Graduate School on Computational Optimization","grant_number":"W1260-N35"}],"doi":"10.48550/arXiv.2502.06560","day":"02","oa_version":"Preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2502.06560","open_access":"1"}],"author":[{"last_name":"Iofinova","orcid":"0000-0002-7778-3221","full_name":"Iofinova, Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","first_name":"Eugenia B"},{"last_name":"Jovanovic","first_name":"Andrej","full_name":"Jovanovic, Andrej"},{"last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"language":[{"iso":"eng"}],"year":"2025","publication":"arXiv","acknowledgement":"This research was supported by the Scientific Service Units (SSU) of IST Austria through resources\r\nprovided by Scientific Computing (SciComp). EI was supported in part by the FWF DK VGSCO,\r\ngrant agreement number W1260-N35. 
AJ was supported in part by ERC Proof-of-Concept Grant\r\nFastML, grant agreement 101158077.","date_created":"2026-05-11T08:55:23Z","date_published":"2025-06-02T00:00:00Z","article_processing_charge":"No","OA_place":"repository","status":"public","related_material":{"record":[{"id":"21854","status":"public","relation":"dissertation_contains"}]},"external_id":{"arxiv":["2502.06560"]},"OA_type":"green","abstract":[{"lang":"eng","text":"The recent surge in high-quality open-source Generative AI text models (colloquially: LLMs), as well as efficient finetuning techniques, have opened the possibility of creating high-quality personalized models that generate text attuned to a specific individual’s needs and are capable of credibly imitating their writing style by refining an open-source model with that person’s own data. The technology to create such models is accessible to private individuals, and training and running such models can be done cheaply on consumer-grade hardware. While these advancements are a huge gain for usability and privacy, this position paper argues that the practical feasibility of impersonating specific individuals also introduces novel safety risks. For instance, this technology enables the creation of phishing emails\r\nor fraudulent social media accounts, based on small amounts of publicly available text, or by the individuals themselves to escape AI text detection. 
We further argue that these risks are complementary to—and distinct from—the much-discussed risks of other impersonation attacks such as image, voice, or video deepfakes, and are not adequately addressed by the larger research community, or the current generation of open- and closed-source models."}]},{"corr_author":"1","title":"Federated SGD with local asynchrony","publication_identifier":{"isbn":["9798350386059"],"eissn":["2575-8411"],"issn":["1063-6927"]},"month":"07","_id":"18070","isi":1,"language":[{"iso":"eng"}],"year":"2024","publication":"Proceedings of the 44th International Conference on Distributed Computing Systems","date_updated":"2025-09-08T09:23:48Z","type":"conference","quality_controlled":"1","author":[{"last_name":"Chatterjee","id":"3C41A08A-F248-11E8-B48F-1D18A9856A87","first_name":"Bapi","full_name":"Chatterjee, Bapi","orcid":"0000-0002-2742-4028"},{"last_name":"Kungurtsev","full_name":"Kungurtsev, Vyacheslav","first_name":"Vyacheslav"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh"}],"user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","doi":"10.1109/ICDCS60910.2024.00084","day":"26","oa_version":"None","conference":{"start_date":"2024-07-23","end_date":"2024-07-26","location":"Jersey City, NJ, United States","name":"ICDCS: International Conference on Distributed Computing Systems"},"scopus_import":"1","page":"857-868","abstract":[{"lang":"eng","text":"Parallel SGD in a shared-memory setting is oft-represented by the popular Hogwild! algorithm, in which lock-free updates are asynchronously performed by multiple computing processes. Unfortunately, scaling Hogwild! to distributed workers is largely unexplored. Specifically, it is unknown if any adaptation of Hogwild! to the popular decentralized multi-GPU setting offers any competitive speedup, either empirically or theoretically. 
In this work, we investigate the potential of decentralizing Hogwild! by incorporating simultaneously (a) asynchronous local gradient updates on the shared memory of GPUs, and (b) non-blocking asynchronous decentralized federated averaging. A naive direct implementation shows degradation in performance, arising from scheduling overheads and concurrent write conflicts on GPUs. To mitigate these drawbacks, we investigate and propose a new method, based on careful block selection rules, which update only portions of the parameter vectors. Our experiments show that the resulting decentralized training method exhibits improved throughput and competitive accuracy for standard image classification benchmarks on the CIFAR-10, CIFAR-100, and Imagenet datasets. On the theoretical side, we prove that our method guarantees sublinear ergodic convergence rates for non-convex objectives."}],"department":[{"_id":"DaAl"}],"external_id":{"isi":["001304430200075"]},"publication_status":"published","date_created":"2024-09-15T22:01:41Z","date_published":"2024-07-26T00:00:00Z","article_processing_charge":"No","publisher":"IEEE","citation":{"mla":"Chatterjee, Bapi, et al. “Federated SGD with Local Asynchrony.” <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, IEEE, 2024, pp. 857–68, doi:<a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">10.1109/ICDCS60910.2024.00084</a>.","ieee":"B. Chatterjee, V. Kungurtsev, and D.-A. Alistarh, “Federated SGD with local asynchrony,” in <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, Jersey City, NJ, United States, 2024, pp. 857–868.","ista":"Chatterjee B, Kungurtsev V, Alistarh D-A. 2024. Federated SGD with local asynchrony. Proceedings of the 44th International Conference on Distributed Computing Systems. ICDCS: International Conference on Distributed Computing Systems, 857–868.","apa":"Chatterjee, B., Kungurtsev, V., &#38; Alistarh, D.-A. (2024). 
Federated SGD with local asynchrony. In <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i> (pp. 857–868). Jersey City, NJ, United States: IEEE. <a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">https://doi.org/10.1109/ICDCS60910.2024.00084</a>","short":"B. Chatterjee, V. Kungurtsev, D.-A. Alistarh, in:, Proceedings of the 44th International Conference on Distributed Computing Systems, IEEE, 2024, pp. 857–868.","ama":"Chatterjee B, Kungurtsev V, Alistarh D-A. Federated SGD with local asynchrony. In: <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>. IEEE; 2024:857-868. doi:<a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">10.1109/ICDCS60910.2024.00084</a>","chicago":"Chatterjee, Bapi, Vyacheslav Kungurtsev, and Dan-Adrian Alistarh. “Federated SGD with Local Asynchrony.” In <i>Proceedings of the 44th International Conference on Distributed Computing Systems</i>, 857–68. IEEE, 2024. <a href=\"https://doi.org/10.1109/ICDCS60910.2024.00084\">https://doi.org/10.1109/ICDCS60910.2024.00084</a>."},"status":"public"},{"article_processing_charge":"No","date_published":"2024-09-01T00:00:00Z","date_created":"2024-09-22T22:01:43Z","status":"public","external_id":{"arxiv":["2401.06118"]},"abstract":[{"lang":"eng","text":"The emergence of accurate open large language models (LLMs) has led to a race towards performant quantization techniques which can enable their execution on end-user devices. In this paper, we revisit the problem of “extreme” LLM compression—defined as targeting extremely low bit counts, such as 2 to 3 bits per parameter—from the point of view of classic methods in Multi-Codebook Quantization (MCQ). 
Our algorithm, called AQLM, generalizes the classic Additive Quantization (AQ) approach for information retrieval to advance the state-of-the-art in LLM compression, via two innovations: 1) learned additive quantization of weight matrices in input-adaptive fashion, and 2) joint optimization of codebook parameters across each transformer blocks. Broadly, AQLM is the first scheme that is Pareto optimal in terms of accuracy-vs-model-size when compressing to less than 3 bits per parameter, and significantly improves upon all known schemes in the extreme compression (2bit) regime. In addition, AQLM is practical: we provide fast GPU and CPU implementations of AQLM for token generation, which enable us to match or outperform optimized FP16 implementations for speed, while executing in a much smaller memory footprint."}],"page":"12284-12303","quality_controlled":"1","author":[{"full_name":"Egiazarian, Vage","first_name":"Vage","last_name":"Egiazarian"},{"full_name":"Panferov, Andrei","first_name":"Andrei","id":"2c18daae-4dbe-11ef-8491-98ce2d960f09","last_name":"Panferov"},{"last_name":"Kuznedelev","full_name":"Kuznedelev, Denis","first_name":"Denis"},{"first_name":"Elias","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias","last_name":"Frantar"},{"full_name":"Babenko, Artem","first_name":"Artem","last_name":"Babenko"},{"last_name":"Alistarh","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian"}],"publication_identifier":{"eissn":["2640-3498"]},"language":[{"iso":"eng"}],"publication":"Proceedings of the 41st International Conference on Machine Learning","year":"2024","acknowledgement":"Authors would like to thank Ruslan Svirschevski for his help in solving technical issues with AQLM and baselines. We also thank Tim Dettmers for helpful discussions on the structure of weights in modern LLMs and size-accuracy trade-offs. 
The authors would also like to thank Daniil Pavlov for his assistance with CPU benchmarking. Finally, authors would like to thank the communities of ML enthusiasts known as LocalLLaMA and Petals community on discord\r\nfor the crowd wisdom about running LLMs on consumer devices. Egiazarian Vage and Denis Kuznedelev and Andrei Panferov were supported by the grant for research centers in the field of AI provided by the Analytical Center for the Government of the Russian Federation (ACRF) in\r\naccordance with the agreement on the provision of subsidies (identifier of the agreement 000000D730321P5Q0002) and the agreement with HSE University No. 70-2021-00139.","publisher":"ML Research Press","oa":1,"citation":{"ista":"Egiazarian V, Panferov A, Kuznedelev D, Frantar E, Babenko A, Alistarh D-A. 2024. Extreme compression of large language models via additive quantization. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 12284–12303.","ieee":"V. Egiazarian, A. Panferov, D. Kuznedelev, E. Frantar, A. Babenko, and D.-A. Alistarh, “Extreme compression of large language models via additive quantization,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 12284–12303.","apa":"Egiazarian, V., Panferov, A., Kuznedelev, D., Frantar, E., Babenko, A., &#38; Alistarh, D.-A. (2024). Extreme compression of large language models via additive quantization. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 12284–12303). Vienna, Austria: ML Research Press.","mla":"Egiazarian, Vage, et al. “Extreme Compression of Large Language Models via Additive Quantization.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 12284–303.","ama":"Egiazarian V, Panferov A, Kuznedelev D, Frantar E, Babenko A, Alistarh D-A. 
Extreme compression of large language models via additive quantization. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:12284-12303.","short":"V. Egiazarian, A. Panferov, D. Kuznedelev, E. Frantar, A. Babenko, D.-A. Alistarh, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 12284–12303.","chicago":"Egiazarian, Vage, Andrei Panferov, Denis Kuznedelev, Elias Frantar, Artem Babenko, and Dan-Adrian Alistarh. “Extreme Compression of Large Language Models via Additive Quantization.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:12284–303. ML Research Press, 2024."},"intvolume":"       235","publication_status":"published","arxiv":1,"department":[{"_id":"DaAl"},{"_id":"GradSch"}],"conference":{"name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria","end_date":"2024-07-27","start_date":"2024-07-21"},"scopus_import":"1","volume":235,"day":"01","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2401.06118","open_access":"1"}],"oa_version":"Preprint","type":"conference","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","month":"09","date_updated":"2024-10-01T08:13:05Z","_id":"18113","alternative_title":["PMLR"],"corr_author":"1","title":"Extreme compression of large language models via additive quantization"}]
