[{"date_published":"2026-04-06T00:00:00Z","day":"06","file_date_updated":"2026-06-03T05:51:19Z","department":[{"_id":"DaAl"}],"page":"265-284","type":"conference","publication_status":"published","file":[{"file_size":2099944,"access_level":"open_access","file_name":"2026_CPAL_Schultheis.pdf","creator":"dernst","checksum":"72f9a87c70f1e2105ef64050ee5017e5","content_type":"application/pdf","file_id":"21942","date_created":"2026-06-03T05:51:19Z","success":1,"relation":"main_file","date_updated":"2026-06-03T05:51:19Z"}],"intvolume":"       328","alternative_title":["PMLR"],"author":[{"first_name":"Erik","full_name":"Schultheis, Erik","last_name":"Schultheis","id":"2786b299-e6b0-11f0-91da-9243fe3ef96b"},{"first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X"}],"publication_identifier":{"eissn":["2640-3498"]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version","related_material":{"link":[{"url":"https://github.com/IST-DASLab/llmq","relation":"software"}]},"publication":"2nd Conference on Parsimony and Learning","corr_author":"1","acknowledgement":"We would like to thank contacts at NVIDIA (Vartika Singh, Nina Carrejo, Kyla Wilkes, and Tijmen\r\nBlankevoort), HP (Curtis Burkhalter), and Datacrunch/Verda (Paul Chang and Antonio\r\nDominguez) for hardware support that was essential to this project. ES was supported in part\r\nby ERC Proof-of-Concept grant FastML.","date_updated":"2026-06-03T05:53:30Z","ddc":["000"],"_id":"21932","quality_controlled":"1","article_processing_charge":"No","oa":1,"conference":{"location":"Stanford, CA, United States","name":"CPAL: Conference on Parsimony and Learning","start_date":"2025-03-24","end_date":"2025-03-27"},"OA_type":"diamond","status":"public","date_created":"2026-05-31T22:02:13Z","month":"04","citation":{"apa":"Schultheis, E., &#38; Alistarh, D.-A. (2026). 
LLMQ: Efficient lower-precision LLM training for consumer GPUs. In <i>2nd Conference on Parsimony and Learning</i> (Vol. 328, pp. 265–284). Stanford, CA, United States: ML Research Press.","ama":"Schultheis E, Alistarh D-A. LLMQ: Efficient lower-precision LLM training for consumer GPUs. In: <i>2nd Conference on Parsimony and Learning</i>. Vol 328. ML Research Press; 2026:265-284.","ieee":"E. Schultheis and D.-A. Alistarh, “LLMQ: Efficient lower-precision LLM training for consumer GPUs,” in <i>2nd Conference on Parsimony and Learning</i>, Stanford, CA, United States, 2026, vol. 328, pp. 265–284.","mla":"Schultheis, Erik, and Dan-Adrian Alistarh. “LLMQ: Efficient Lower-Precision LLM Training for Consumer GPUs.” <i>2nd Conference on Parsimony and Learning</i>, vol. 328, ML Research Press, 2026, pp. 265–84.","chicago":"Schultheis, Erik, and Dan-Adrian Alistarh. “LLMQ: Efficient Lower-Precision LLM Training for Consumer GPUs.” In <i>2nd Conference on Parsimony and Learning</i>, 328:265–84. ML Research Press, 2026.","ista":"Schultheis E, Alistarh D-A. 2026. LLMQ: Efficient lower-precision LLM training for consumer GPUs. 2nd Conference on Parsimony and Learning. CPAL: Conference on Parsimony and Learning, PMLR, vol. 328, 265–284.","short":"E. Schultheis, D.-A. Alistarh, in:, 2nd Conference on Parsimony and Learning, ML Research Press, 2026, pp. 265–284."},"volume":328,"language":[{"iso":"eng"}],"OA_place":"publisher","publisher":"ML Research Press","title":"LLMQ: Efficient lower-precision LLM training for consumer GPUs","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"abstract":[{"lang":"eng","text":"We present LLMQ, an end-to-end CUDA/C++ implementation for medium-sized language-model training, e.g. 3B to 32B parameters, on affordable, commodity GPUs. 
These devices are characterized by low memory availability and slow communication compared to datacentre-grade GPUs. Consequently, we showcase a range of optimizations that target these bottlenecks, including activation checkpointing, offloading, and copy-engine based collectives. LLMQ is able to train or fine-tune a 7B model on a single 16GB mid-range gaming card, or a 32B model on a workstation equipped with 4 RTX 4090s. This is achieved while executing a standard 8-bit training pipeline, without additional algorithmic approximations, and maintaining FLOP utilization of around 50%. The efficiency of LLMQ rivals that of production-scale systems on much more expensive cloud-grade GPUs."}],"has_accepted_license":"1","year":"2026","scopus_import":"1"},{"publication_status":"published","page":"6404–6418","type":"conference","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"day":"01","date_published":"2026-04-01T00:00:00Z","date_updated":"2026-07-14T06:18:11Z","corr_author":"1","publication":"Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics","oa_version":"Preprint","arxiv":1,"supplementarymaterial":"no","researchdata_availability":"no","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"full_name":"Pankratov, Sergei","first_name":"Sergei","last_name":"Pankratov","id":"f773bf05-72ef-11ef-b75a-a383d22f454b"},{"orcid":"0000-0003-3650-940X","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"}],"doi":"10.18653/v1/2026.eacl-long.301","OA_place":"repository","language":[{"iso":"eng"}],"citation":{"short":"S. Pankratov, D.-A. Alistarh, in:, Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics, Association for Computational Linguistics, 2026, pp. 6404–6418.","chicago":"Pankratov, Sergei, and Dan-Adrian Alistarh. 
“Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks.” In <i>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</i>, 6404–6418. Association for Computational Linguistics, 2026. <a href=\"https://doi.org/10.18653/v1/2026.eacl-long.301\">https://doi.org/10.18653/v1/2026.eacl-long.301</a>.","ista":"Pankratov S, Alistarh D-A. 2026. Speculative decoding speed-of-light: Optimal lower bounds via branching random walks. Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics. EACL:  Conference of the European Chapter of the Association for Computational Linguistics, 6404–6418.","mla":"Pankratov, Sergei, and Dan-Adrian Alistarh. “Speculative Decoding Speed-of-Light: Optimal Lower Bounds via Branching Random Walks.” <i>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</i>, Association for Computational Linguistics, 2026, pp. 6404–6418, doi:<a href=\"https://doi.org/10.18653/v1/2026.eacl-long.301\">10.18653/v1/2026.eacl-long.301</a>.","ieee":"S. Pankratov and D.-A. Alistarh, “Speculative decoding speed-of-light: Optimal lower bounds via branching random walks,” in <i>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</i>, Rabat, Morocco, 2026, pp. 6404–6418.","ama":"Pankratov S, Alistarh D-A. Speculative decoding speed-of-light: Optimal lower bounds via branching random walks. In: <i>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</i>. Association for Computational Linguistics; 2026:6404–6418. doi:<a href=\"https://doi.org/10.18653/v1/2026.eacl-long.301\">10.18653/v1/2026.eacl-long.301</a>","apa":"Pankratov, S., &#38; Alistarh, D.-A. (2026). Speculative decoding speed-of-light: Optimal lower bounds via branching random walks. 
In <i>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics</i> (pp. 6404–6418). Rabat, Morocco: Association for Computational Linguistics. <a href=\"https://doi.org/10.18653/v1/2026.eacl-long.301\">https://doi.org/10.18653/v1/2026.eacl-long.301</a>"},"month":"04","date_created":"2026-07-13T10:48:03Z","das_tickbox":"0","status":"public","OA_type":"green","external_id":{"arxiv":["2512.11718"]},"conference":{"location":"Rabat, Morocco","end_date":"2026-03-29","name":"EACL:  Conference of the European Chapter of the Association for Computational Linguistics","start_date":"2026-03-24"},"article_processing_charge":"No","_id":"22302","quality_controlled":"1","scopus_import":"1","year":"2026","abstract":[{"lang":"eng","text":"Speculative generation has emerged as a promising technique to accelerate inference in large language models (LLMs) by leveraging parallelism to verify multiple draft tokens simultaneously. However, the fundamental limits on the achievable speedup remain poorly understood. In this work, we establish the first “tight” lower bounds on the runtime of any deterministic speculative generation algorithm. This is achieved by drawing a parallel between the token generation process and branching random walks, which allows us to analyze the optimal draft tree selection problem. We prove, under basic assumptions, that the expected number of tokens successfully predicted per speculative iteration is bounded as $\\mathbb{E}[X] \\leq (\\mu + \\mu_{(2)}) \\log(B)/\\mu^2 + O(1)$, where $B$ is the verifier’s batch size, $\\mu$ is the expected entropy of the verifier’s output distribution, and $\\mu_{(2)}$ is this entropy’s second moment. This result provides new insights into the limits of parallel token generation, and could guide the design of future speculative decoding systems. 
Empirical evaluations on Llama models validate our theoretical predictions, confirming the tightness of our bounds in practical settings."}],"title":"Speculative decoding speed-of-light: Optimal lower bounds via branching random walks","publisher":"Association for Computational Linguistics"},{"related_material":{"record":[{"id":"14771","status":"public","relation":"part_of_dissertation"},{"id":"18121","status":"public","relation":"part_of_dissertation"},{"status":"public","id":"21858","relation":"part_of_dissertation"},{"id":"21859","status":"public","relation":"part_of_dissertation"},{"relation":"part_of_dissertation","id":"21857","status":"public"}]},"acknowledged_ssus":[{"_id":"ScienComp"}],"corr_author":"1","acknowledgement":"The research in this Ph.D. was funded in whole\r\nor in part by the Austrian Science Fund (FWF) W1260-N35 (Vienna Graduate School for\r\nComputational Optimization). For open access purposes the author has applied a CC BY\r\npublic copyright license to any author accepted manuscript version arising from this submission\r\nwherever possible. Additionally, I am grateful to Alois Schlögl, Waleed Khalid, and the rest of\r\nthe ISTA Scientific Computing team for building and maintaining the infrastructure I used\r\nto run experiments. 
I’m also deeply grateful to the Alistarh group’s administrative assistant,\r\nChristine Francois, who always deals with our nonsense with common sense and a smile.\r\n","date_updated":"2026-07-27T12:50:04Z","publication_identifier":{"issn":["2663-337X"]},"doi_confirm":"1","author":[{"first_name":"Eugenia B","full_name":"Iofinova, Eugenia B","orcid":"0000-0002-7778-3221","last_name":"Iofinova","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117"}],"user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","oa_version":"Published Version","publication_status":"published","file":[{"date_created":"2026-05-11T08:36:01Z","date_updated":"2026-05-11T08:36:01Z","relation":"source_file","file_name":"EIofinova_thesis_FinalVersion.zip","access_level":"closed","file_size":28479571,"file_id":"21856","content_type":"application/zip","creator":"eiofinov","checksum":"2e148dad920e3f9b7c32796e0ba2e5f7"},{"access_level":"open_access","file_size":18137757,"file_name":"2026_Iofinova_Eugenia_Thesis.pdf","creator":"eiofinov","checksum":"b10c2933f386f532b2dbf28b19c5525c","file_id":"21877","content_type":"application/pdf","success":1,"date_created":"2026-05-13T13:10:48Z","relation":"main_file","date_updated":"2026-05-13T13:10:48Z"}],"alternative_title":["ISTA Thesis"],"date_published":"2026-05-11T00:00:00Z","day":"11","file_date_updated":"2026-05-13T13:10:48Z","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"page":"237","type":"dissertation","year":"2026","has_accepted_license":"1","publisher":"Institute of Science and Technology Austria","title":"On the utility and effects of efficiency in artificial neural networks","abstract":[{"lang":"eng","text":"As neural-network-based models grow both in size and popularity, interest has grown in making the models smaller and more efficient to train. To that end, many methods have been proposed to prune models by reducing their number of nonzero parameters. 
Additionally, parameter-efficient fine-tuning, in which a much smaller number of parameters than the total contained in the model is updated during training, has become very popular, especially in the space of Large Language Models. At the same time, the increasingly routine deployment of machine learning in real-world applications has spurred a drive to make them more trustworthy - in the sense of, among other things, being unbiased, interpretable, and editable. In this thesis, we examine the interplay between efficiency and trustworthiness.\r\n\r\nFirst, we analyze the effects of model pruning on bias in computer vision models, demonstrating that increased sparsity leads to greater bias, largely as a function of increased model uncertainty in marginal cases. Based on this observation, we propose several bias mitigation techniques. Then, we demonstrate that example-specific model pruning can improve model interpretation methods while improving pruning efficiency to make example-specific model pruning feasible in real time. Then, we investigate the effectiveness of parameter-efficient and data-efficient model personalization via fine-tuning, demonstrating that it is highly feasible with very small computational and data resources. Finally, we consider efficiency in editing model knowledge using a custom synthetic data framework, demonstrating that parameter-efficient, low-rank fine-tuning frequently outperforms full-rank fine-tuning, and, additionally, that restricting which model blocks are fine-tuned frequently improves results. Together, the results in this thesis provide new insights and techniques for combining trustworthiness and efficiency during neural network inference and training.\r\n\r\n"}],"publisher_comment":"In reference to IEEE copyrighted material which is used with permission in this thesis, the IEEE does not endorse any of ISTA's products or services. Internal or personal use of this material is permitted. 
If interested in reprinting/republishing IEEE copyrighted material for advertising or promotional purposes or for creating new collective works for resale or redistribution, please go to http://www.ieee.org/publications_standards/publications/rights/rights_link.html to learn how to obtain a License from RightsLink. If applicable, University Microfilms and/or ProQuest Library, or the Archives of Canada may supply single copies of the dissertation.","das_tickbox":"1","status":"public","project":[{"name":"Vienna Graduate School on Computational Optimization","grant_number":"W1260-N35","_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A"}],"date_created":"2026-05-11T08:43:22Z","month":"05","citation":{"short":"E.B. Iofinova, On the Utility and Effects of Efficiency in Artificial Neural Networks, Institute of Science and Technology Austria, 2026.","ista":"Iofinova EB. 2026. On the utility and effects of efficiency in artificial neural networks. Institute of Science and Technology Austria.","chicago":"Iofinova, Eugenia B. “On the Utility and Effects of Efficiency in Artificial Neural Networks.” Institute of Science and Technology Austria, 2026. <a href=\"https://doi.org/10.15479/AT-ISTA-21854\">https://doi.org/10.15479/AT-ISTA-21854</a>.","ama":"Iofinova EB. On the utility and effects of efficiency in artificial neural networks. 2026. doi:<a href=\"https://doi.org/10.15479/AT-ISTA-21854\">10.15479/AT-ISTA-21854</a>","mla":"Iofinova, Eugenia B. <i>On the Utility and Effects of Efficiency in Artificial Neural Networks</i>. Institute of Science and Technology Austria, 2026, doi:<a href=\"https://doi.org/10.15479/AT-ISTA-21854\">10.15479/AT-ISTA-21854</a>.","ieee":"E. B. Iofinova, “On the utility and effects of efficiency in artificial neural networks,” Institute of Science and Technology Austria, 2026.","apa":"Iofinova, E. B. (2026). <i>On the utility and effects of efficiency in artificial neural networks</i>. Institute of Science and Technology Austria. 
<a href=\"https://doi.org/10.15479/AT-ISTA-21854\">https://doi.org/10.15479/AT-ISTA-21854</a>"},"degree_awarded":"PhD","language":[{"iso":"eng"}],"doi":"10.15479/AT-ISTA-21854","OA_place":"publisher","supervisor":[{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian"}],"ddc":["000"],"article_processing_charge":"No","_id":"21854","oa":1},{"year":"2026","title":"Panza: Investigating the feasibility of fully-local personalized text generation","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"publisher":"OpenReview","keyword":["LLMs","PEFT","LoRA","personalization","efficient ML"],"abstract":[{"text":"The availability of powerful open-source large language models (LLMs) opens exciting use cases, such as using personal data to fine-tune these models to imitate a user’s unique writing style. Two key requirements for this functionality are personalization–in the sense that the output should recognizably reflect the user’s own writing style—and privacy–users may justifiably be wary of uploading extremely personal data, such as their email archive, to a third-party service. In this paper, we demonstrate the feasibility of training and running such an assistant, which we call Panza, on commodity hardware, for the specific use case of email generation. Panza’s personalization features are based on a combination of parameter-efficient fine-tuning using a variant of the Reverse Instructions technique [1] and Retrieval-Augmented Generation (RAG) [2]. We demonstrate that this combination allows us to fine-tune an LLM to reflect a user’s writing style using limited data, while executing on extremely limited resources, e.g. on a free Google Colab instance. 
Our key methodological contribution is the first detailed study of evaluation metrics for this task, and\r\nof how different choices of system components–the use of RAG and of different fine-tuning approaches–impact the system’s performance. Additionally, we demonstrate that very little data - under 100 email samples - are sufficient to create models that convincingly imitate humans, showcasing a previously unknown attack vector in language models. We are releasing the full Panza code as well as three new email datasets licensed for research use.","lang":"eng"}],"citation":{"chicago":"Nicolicioiu, Armand, Eugenia B Iofinova, Andrej Jovanovic, Eldar Kurtic, Mahdi Nikdan, Andrei Panferov, Ilia Markov, Nir Shavit, and Dan-Adrian Alistarh. <i>Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation</i>. <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>. OpenReview, 2026.","ista":"Nicolicioiu A, Iofinova EB, Jovanovic A, Kurtic E, Nikdan M, Panferov A, Markov I, Shavit N, Alistarh D-A. 2026. Panza: Investigating the feasibility of fully-local personalized text generation, OpenReview,p.","short":"A. Nicolicioiu, E.B. Iofinova, A. Jovanovic, E. Kurtic, M. Nikdan, A. Panferov, I. Markov, N. Shavit, D.-A. Alistarh, Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation, OpenReview, 2026.","apa":"Nicolicioiu, A., Iofinova, E. B., Jovanovic, A., Kurtic, E., Nikdan, M., Panferov, A., … Alistarh, D.-A. (2026). <i>Panza: Investigating the feasibility of fully-local personalized text generation</i>. <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>. Tübingen, Germany: OpenReview.","ieee":"A. Nicolicioiu <i>et al.</i>, <i>Panza: Investigating the feasibility of fully-local personalized text generation</i>. OpenReview, 2026.","mla":"Nicolicioiu, Armand, et al. 
“Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation.” <i>Third Conference on Parsimony and Learning (Proceedings Track)</i>, 81, OpenReview, 2026.","ama":"Nicolicioiu A, Iofinova EB, Jovanovic A, et al. <i>Panza: Investigating the Feasibility of Fully-Local Personalized Text Generation</i>. OpenReview; 2026."},"month":"03","date_created":"2026-05-11T08:50:28Z","status":"public","OA_place":"publisher","language":[{"iso":"eng"}],"article_processing_charge":"No","_id":"21857","quality_controlled":"1","OA_type":"green","conference":{"end_date":"2026-03-26","start_date":"2026-03-23","name":"CPAL: Conference on Parsimony and Learning","location":"Tübingen, Germany"},"oa":1,"publication":"Third Conference on Parsimony and Learning (Proceedings Track)","related_material":{"record":[{"relation":"dissertation_contains","status":"public","id":"21854"}]},"date_updated":"2026-07-27T12:50:03Z","corr_author":"1","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","author":[{"first_name":"Armand","full_name":"Nicolicioiu, Armand","last_name":"Nicolicioiu"},{"first_name":"Eugenia B","full_name":"Iofinova, Eugenia B","last_name":"Iofinova","orcid":"0000-0002-7778-3221","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117"},{"last_name":"Jovanovic","full_name":"Jovanovic, Andrej","first_name":"Andrej"},{"last_name":"Kurtic","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","full_name":"Kurtic, Eldar","first_name":"Eldar"},{"full_name":"Nikdan, Mahdi","first_name":"Mahdi","id":"66374281-f394-11eb-9cf6-869147deecc0","last_name":"Nikdan"},{"full_name":"Panferov, Andrei","first_name":"Andrei","id":"2c18daae-4dbe-11ef-8491-98ce2d960f09","last_name":"Panferov"},{"full_name":"Markov, Ilia","first_name":"Ilia","last_name":"Markov","id":"D0CF4148-C985-11E9-8066-0BDEE5697425"},{"last_name":"Shavit","full_name":"Shavit, 
Nir","first_name":"Nir"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"}],"main_file_link":[{"url":"https://openreview.net/pdf?id=soFWnTqd23","open_access":"1"}],"oa_version":"Accepted Version","article_number":"81","publication_status":"published","date_published":"2026-03-06T00:00:00Z","type":"conference_poster","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"day":"06"},{"publication_status":"draft","date_published":"2026-01-30T00:00:00Z","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"type":"preprint","day":"30","publication":"arXiv","acknowledged_ssus":[{"_id":"ScienComp"}],"related_material":{"record":[{"relation":"dissertation_contains","status":"public","id":"21854"}]},"acknowledgement":"EI thanks Weiwei Yang, Janardhan Kulkani, and Kate Lytvynets for their advice and support in\r\ndeveloping an earlier version of the Behemoth library. This research was supported by the Scientific\r\nService Units (SSU) of IST Austria through resources provided by Scientific Computing (SciComp).\r\nEI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35.\r\n","date_updated":"2026-07-27T12:50:03Z","corr_author":"1","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","author":[{"id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","last_name":"Iofinova","orcid":"0000-0002-7778-3221","full_name":"Iofinova, Eugenia B","first_name":"Eugenia B"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"}],"arxiv":1,"oa_version":"Preprint","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2601.23153"}],"month":"01","citation":{"short":"E.B. Iofinova, D.-A. Alistarh, ArXiv (n.d.).","ista":"Iofinova EB, Alistarh D-A. Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. 
arXiv, <a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>.","chicago":"Iofinova, Eugenia B, and Dan-Adrian Alistarh. “Behemoth: Benchmarking Unlearning in LLMs Using Fully Synthetic Data.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/arXiv.2601.23153\">https://doi.org/10.48550/arXiv.2601.23153</a>.","ieee":"E. B. Iofinova and D.-A. Alistarh, “Behemoth: Benchmarking unlearning in LLMs using fully synthetic data,” <i>arXiv</i>. .","mla":"Iofinova, Eugenia B., and Dan-Adrian Alistarh. “Behemoth: Benchmarking Unlearning in LLMs Using Fully Synthetic Data.” <i>ArXiv</i>, doi:<a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>.","ama":"Iofinova EB, Alistarh D-A. Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/arXiv.2601.23153\">10.48550/arXiv.2601.23153</a>","apa":"Iofinova, E. B., &#38; Alistarh, D.-A. (n.d.). Behemoth: Benchmarking unlearning in LLMs using fully synthetic data. <i>arXiv</i>. <a href=\"https://doi.org/10.48550/arXiv.2601.23153\">https://doi.org/10.48550/arXiv.2601.23153</a>"},"status":"public","date_created":"2026-05-11T08:58:07Z","project":[{"_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","grant_number":"W1260-N35","name":"Vienna Graduate School on Computational Optimization"}],"OA_place":"repository","doi":"10.48550/arXiv.2601.23153","language":[{"iso":"eng"}],"_id":"21859","article_processing_charge":"No","external_id":{"arxiv":["2601.23153"]},"OA_type":"green","oa":1,"year":"2026","title":"Behemoth: Benchmarking unlearning in LLMs using fully synthetic data","abstract":[{"text":"As artificial neural networks, and specifically large language models, have improved rapidly in capabilities and quality, they have increasingly been deployed in real-world applications, from customer service to Google search, despite the fact that they frequently make factually incorrect or undesirable statements. 
This trend has inspired practical and academic interest in model editing, that is, in adjusting the weights of the model to modify its likely outputs for queries relating to a specific fact or set of facts. This may be done either to amend a fact or set of facts, for instance, to fix a frequent error in the training data, or to suppress a fact or set of facts entirely, for instance, in case of dangerous knowledge. Multiple methods have been proposed to do such edits. However, at the same time, it has been shown that such model editing can be brittle and incomplete. Moreover the effectiveness of any model editing method necessarily depends on the data on which the model is trained, and, therefore, a good understanding of the interaction of the training data distribution and the way it is stored in the network is necessary and helpful to reliably perform model editing. However, working with large language models trained on real-world data does not allow us to understand this relationship or fully measure the effects of model editing. We therefore propose Behemoth, a fully synthetic data generation framework. To demonstrate the practical insights from the framework, we explore model editing in the context of simple tabular data, demonstrating surprising findings that, in some cases, echo real-world results, for instance, that in some cases restricting the update rank results in a more effective update.","lang":"eng"}]},{"scopus_import":"1","year":"2025","has_accepted_license":"1","abstract":[{"lang":"eng","text":"The high computational costs of large language models (LLMs) have led to a flurry of research on LLM compression, via methods such as quantization, sparsification, or structured pruning. A new frontier in this area is given by dynamic, non-uniform compression methods, which adjust the compression levels (e.g., sparsity) per-block or even per-layer in order to minimize accuracy loss, while guaranteeing a global compression threshold. 
Yet, current methods rely on estimating the \"importance\" of a given layer, implicitly assuming that layers contribute independently to the overall compression error. We begin from the motivating observation that this independence assumption does not generally hold for LLM compression: pruning a model further may even significantly recover performance. To address this, we propose EvoPress, a novel evolutionary framework for dynamic LLM compression. By formulating dynamic compression as a general optimization problem, EvoPress identifies optimal compression profiles in a highly efficient manner, and generalizes across diverse models and compression techniques. Via EvoPress, we achieve state-of-the-art performance for dynamic compression of Llama, Mistral, and Phi models, setting new benchmarks for structural pruning (block/layer dropping), unstructured sparsity, and quantization with dynamic bitwidths."}],"publisher":"ML Research Press","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"EvoPress: Accurate dynamic model compression via evolutionary search","language":[{"iso":"eng"}],"volume":267,"OA_place":"publisher","date_created":"2025-12-14T23:02:05Z","status":"public","citation":{"mla":"Sieberling, Oliver, et al. “EvoPress: Accurate Dynamic Model Compression via Evolutionary Search.” <i>42nd International Conference on Machine Learning</i>, vol. 267, ML Research Press, 2025, pp. 55556–90.","ieee":"O. Sieberling, D. Kuznedelev, E. Kurtic, and D.-A. Alistarh, “EvoPress: Accurate dynamic model compression via evolutionary search,” in <i>42nd International Conference on Machine Learning</i>, Vancouver, Canada, 2025, vol. 267, pp. 55556–55590.","ama":"Sieberling O, Kuznedelev D, Kurtic E, Alistarh D-A. EvoPress: Accurate dynamic model compression via evolutionary search. 
In: <i>42nd International Conference on Machine Learning</i>. Vol 267. ML Research Press; 2025:55556-55590.","apa":"Sieberling, O., Kuznedelev, D., Kurtic, E., &#38; Alistarh, D.-A. (2025). EvoPress: Accurate dynamic model compression via evolutionary search. In <i>42nd International Conference on Machine Learning</i> (Vol. 267, pp. 55556–55590). Vancouver, Canada: ML Research Press.","short":"O. Sieberling, D. Kuznedelev, E. Kurtic, D.-A. Alistarh, in:, 42nd International Conference on Machine Learning, ML Research Press, 2025, pp. 55556–55590.","ista":"Sieberling O, Kuznedelev D, Kurtic E, Alistarh D-A. 2025. EvoPress: Accurate dynamic model compression via evolutionary search. 42nd International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 267, 55556–55590.","chicago":"Sieberling, Oliver, Denis Kuznedelev, Eldar Kurtic, and Dan-Adrian Alistarh. “EvoPress: Accurate Dynamic Model Compression via Evolutionary Search.” In <i>42nd International Conference on Machine Learning</i>, 267:55556–90. 
ML Research Press, 2025."},"month":"05","oa":1,"external_id":{"arxiv":["2410.14649"]},"OA_type":"gold","conference":{"end_date":"2025-07-19","name":"ICML: International Conference on Machine Learning","start_date":"2025-07-13","location":"Vancouver, Canada"},"ddc":["000"],"_id":"20820","quality_controlled":"1","article_processing_charge":"No","corr_author":"1","date_updated":"2025-12-16T12:34:32Z","publication":"42nd International Conference on Machine Learning","arxiv":1,"oa_version":"Published Version","publication_identifier":{"eissn":["2640-3498"]},"author":[{"last_name":"Sieberling","full_name":"Sieberling, Oliver","first_name":"Oliver"},{"last_name":"Kuznedelev","first_name":"Denis","full_name":"Kuznedelev, Denis"},{"full_name":"Kurtic, Eldar","first_name":"Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","last_name":"Kurtic"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","publication_status":"published","intvolume":"       267","alternative_title":["PMLR"],"file":[{"creator":"dernst","checksum":"1d744fbaeb199b08e8b6f48bc0dd047e","content_type":"application/pdf","file_id":"20828","file_name":"2025_ICML_Sieberling.pdf","access_level":"open_access","file_size":908379,"date_updated":"2025-12-16T12:32:40Z","relation":"main_file","date_created":"2025-12-16T12:32:40Z","success":1}],"file_date_updated":"2025-12-16T12:32:40Z","day":"01","type":"conference","page":"55556-55590","department":[{"_id":"DaAl"}],"date_published":"2025-05-01T00:00:00Z"},{"publication":"42nd International Conference on Machine Learning","acknowledgement":"This work was supported by Hasler Foundation Program: Hasler Responsible AI (project number 21043). The research was also sponsored by the Army Research Office and was accomplished under Grant Number W911NF-24-1-0048. 
This work was further funded by the Swiss National Science Foundation (SNSF) under grant number 200021_205011. We also acknowledge project A11 of the Swiss National Supercomputing Centre (CSCS) for providing computing resources. Dan Alistarh and Ilia Markov were supported in part through the ERC Proof-of-Concept grant FastML (Grant Agreement 101158077). Ali Ramezani-Kebrya was supported by the Research Council of Norway through FRIPRO Grant under project number 356103, its Centres of Excellence scheme, Integreat - Norwegian Centre for knowledge-driven machine learning under\r\nproject number 332645 - and its Centre for Research-based Innovation funding scheme (Visual Intelligence under grant no. 309439).","date_updated":"2025-12-16T12:46:54Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"full_name":"Nguyen, Anh Duc","first_name":"Anh Duc","last_name":"Nguyen"},{"full_name":"Markov, Ilia","first_name":"Ilia","id":"D0CF4148-C985-11E9-8066-0BDEE5697425","last_name":"Markov"},{"full_name":"Wu, Frank Zhengqing","first_name":"Frank Zhengqing","last_name":"Wu"},{"last_name":"Ramezani-Kebrya","first_name":"Ali","full_name":"Ramezani-Kebrya, Ali"},{"first_name":"Kimon","full_name":"Antonakopoulos, Kimon","last_name":"Antonakopoulos"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"},{"full_name":"Cevher, Volkan","first_name":"Volkan","last_name":"Cevher"}],"publication_identifier":{"eissn":["2640-3498"]},"oa_version":"Published Version","arxiv":1,"file":[{"file_size":756213,"access_level":"open_access","file_name":"2025_ICML_Nguyen.pdf","content_type":"application/pdf","file_id":"20830","creator":"dernst","checksum":"a7edf0e4304171a3e035842b3aab1704","success":1,"date_created":"2025-12-16T12:45:41Z","relation":"main_file","date_updated":"2025-12-16T12:45:41Z"}],"alternative_title":["PMLR"],"intvolume":"       
267","publication_status":"published","date_published":"2025-05-01T00:00:00Z","department":[{"_id":"DaAl"}],"page":"46026-46072","type":"conference","day":"01","file_date_updated":"2025-12-16T12:45:41Z","year":"2025","has_accepted_license":"1","scopus_import":"1","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"Layer-wise quantization for quantized optimistic dual averaging","publisher":"ML Research Press","abstract":[{"lang":"eng","text":"Modern deep neural networks exhibit heterogeneity across numerous layers of various types such as residuals, multi-head attention, etc., due to varying structures (dimensions, activation functions, etc.), distinct representation characteristics, which impact predictions. We develop a general layer-wise quantization framework with tight variance and code-length bounds, adapting to the heterogeneities over the course of training. We then apply a new layer-wise quantization technique within distributed variational inequalities (VIs), proposing a novel Quantized Optimistic Dual Averaging (QODA) algorithm with adaptive learning rates, which achieves competitive convergence rates for monotone VIs. We empirically show that QODA achieves up to a 150% speedup over the baselines in end-to-end training time for training Wasserstein GAN on 12+GPUs."}],"month":"05","citation":{"mla":"Nguyen, Anh Duc, et al. “Layer-Wise Quantization for Quantized Optimistic Dual Averaging.” <i>42nd International Conference on Machine Learning</i>, vol. 267, ML Research Press, 2025, pp. 46026–72.","ieee":"A. D. Nguyen <i>et al.</i>, “Layer-wise quantization for quantized optimistic dual averaging,” in <i>42nd International Conference on Machine Learning</i>, Vancouver, Canada, 2025, vol. 267, pp. 46026–46072.","ama":"Nguyen AD, Markov I, Wu FZ, et al. 
Layer-wise quantization for quantized optimistic dual averaging. In: <i>42nd International Conference on Machine Learning</i>. Vol 267. ML Research Press; 2025:46026-46072.","apa":"Nguyen, A. D., Markov, I., Wu, F. Z., Ramezani-Kebrya, A., Antonakopoulos, K., Alistarh, D.-A., &#38; Cevher, V. (2025). Layer-wise quantization for quantized optimistic dual averaging. In <i>42nd International Conference on Machine Learning</i> (Vol. 267, pp. 46026–46072). Vancouver, Canada: ML Research Press.","short":"A.D. Nguyen, I. Markov, F.Z. Wu, A. Ramezani-Kebrya, K. Antonakopoulos, D.-A. Alistarh, V. Cevher, in:, 42nd International Conference on Machine Learning, ML Research Press, 2025, pp. 46026–46072.","chicago":"Nguyen, Anh Duc, Ilia Markov, Frank Zhengqing Wu, Ali Ramezani-Kebrya, Kimon Antonakopoulos, Dan-Adrian Alistarh, and Volkan Cevher. “Layer-Wise Quantization for Quantized Optimistic Dual Averaging.” In <i>42nd International Conference on Machine Learning</i>, 267:46026–72. ML Research Press, 2025.","ista":"Nguyen AD, Markov I, Wu FZ, Ramezani-Kebrya A, Antonakopoulos K, Alistarh D-A, Cevher V. 2025. Layer-wise quantization for quantized optimistic dual averaging. 42nd International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 267, 46026–46072."},"status":"public","project":[{"_id":"8e35c14b-16d5-11f0-9cad-a3fc35339161","grant_number":"101158077","name":"FastML: Efficient and Cost-Effective Distributed Machine Learning"}],"date_created":"2025-12-14T23:02:06Z","OA_place":"publisher","volume":267,"language":[{"iso":"eng"}],"_id":"20821","article_processing_charge":"No","quality_controlled":"1","ddc":["000"],"conference":{"end_date":"2025-07-19","start_date":"2025-07-13","name":"ICML: International Conference on Machine Learning","location":"Vancouver, Canada"},"OA_type":"gold","external_id":{"arxiv":["2505.14371"]},"oa":1},{"date_created":"2026-02-16T15:41:15Z","status":"public","citation":{"short":"D.-A. Alistarh, F. 
Ellen, A. Fedorov, in:, 39th International Symposium on Distributed Computing, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025, p. 3:1-3:16.","chicago":"Alistarh, Dan-Adrian, Faith Ellen, and Alexander Fedorov. “An Almost-Logarithmic Lower Bound for Leader Election with Bounded Value Contention.” In <i>39th International Symposium on Distributed Computing</i>, 356:3:1-3:16. Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025. <a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">https://doi.org/10.4230/LIPIcs.DISC.2025.3</a>.","ista":"Alistarh D-A, Ellen F, Fedorov A. 2025. An almost-logarithmic lower bound for leader election with bounded value contention. 39th International Symposium on Distributed Computing. DISC: Symposium on Distributed Computing, LIPIcs, vol. 356, 3:1-3:16.","ama":"Alistarh D-A, Ellen F, Fedorov A. An almost-logarithmic lower bound for leader election with bounded value contention. In: <i>39th International Symposium on Distributed Computing</i>. Vol 356. Schloss Dagstuhl - Leibniz-Zentrum für Informatik; 2025:3:1-3:16. doi:<a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">10.4230/LIPIcs.DISC.2025.3</a>","mla":"Alistarh, Dan-Adrian, et al. “An Almost-Logarithmic Lower Bound for Leader Election with Bounded Value Contention.” <i>39th International Symposium on Distributed Computing</i>, vol. 356, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2025, p. 3:1-3:16, doi:<a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">10.4230/LIPIcs.DISC.2025.3</a>.","ieee":"D.-A. Alistarh, F. Ellen, and A. Fedorov, “An almost-logarithmic lower bound for leader election with bounded value contention,” in <i>39th International Symposium on Distributed Computing</i>, Berlin, Germany, 2025, vol. 356, p. 3:1-3:16.","apa":"Alistarh, D.-A., Ellen, F., &#38; Fedorov, A. (2025). An almost-logarithmic lower bound for leader election with bounded value contention. In <i>39th International Symposium on Distributed Computing</i> (Vol. 356, p. 
3:1-3:16). Berlin, Germany: Schloss Dagstuhl - Leibniz-Zentrum für Informatik. <a href=\"https://doi.org/10.4230/LIPIcs.DISC.2025.3\">https://doi.org/10.4230/LIPIcs.DISC.2025.3</a>"},"month":"10","language":[{"iso":"eng"}],"volume":356,"doi":"10.4230/LIPIcs.DISC.2025.3","OA_place":"publisher","ddc":["000"],"_id":"21250","article_processing_charge":"Yes","quality_controlled":"1","oa":1,"OA_type":"gold","conference":{"location":"Berlin, Germany","end_date":"2025-10-31","start_date":"2025-10-27","name":"DISC: Symposium on Distributed Computing"},"year":"2025","has_accepted_license":"1","publisher":"Schloss Dagstuhl - Leibniz-Zentrum für Informatik","title":"An almost-logarithmic lower bound for leader election with bounded value contention","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"abstract":[{"text":"We investigate the step complexity of the Leader Election problem (and implementing the corresponding test-and-set object) in asynchronous shared memory, where processes communicate through registers supporting atomic read and write and must coordinate so that a single process becomes the leader. Determining tight step complexity bounds for solving this problem is one of the key open problems in the theory of shared memory distributed computing. The best known algorithm is a randomized tournament-tree, which has worst-case expected step complexity O(log N) for N processes. There are provably no deterministic wait-free algorithms, and only restricted lower bounds are known for obstruction-free and randomized wait-free algorithms. 
We introduce a new lower bound that establishes an Ω((log N)/(log log N + log Q)) step complexity for any obstruction-free Leader Election algorithm, where N is the number of processes, and 2 ≤ Q ≤ N is a bound on the value contention, which we define as the maximum number of different values that processes can be simultaneously poised to write to the same register in any execution of the algorithm. Our result is strictly stronger than previous bounds based on write contention. In particular, it implies new lower bounds on step complexity that depend on register size.","lang":"eng"}],"publication_status":"published","intvolume":"       356","alternative_title":["LIPIcs"],"file":[{"date_created":"2026-02-18T06:46:02Z","success":1,"date_updated":"2026-02-18T06:46:02Z","relation":"main_file","file_name":"2025_LIPIcs_Alistarh.pdf","access_level":"open_access","file_size":1492189,"file_id":"21310","content_type":"application/pdf","creator":"dernst","checksum":"3825a0e6e6a05503e842a59f95528bd9"}],"date_published":"2025-10-22T00:00:00Z","day":"22","file_date_updated":"2026-02-18T06:46:02Z","type":"conference","page":"3:1-3:16","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"publication":"39th International Symposium on Distributed Computing","corr_author":"1","date_updated":"2026-02-18T06:49:38Z","acknowledgement":"The work of Dan Alistarh is supported by grants from ERC, Austrian FWF, and the Google and NVIDIA corporations. 
Faith Ellen was supported in part by the Natural Science and Engineering Research Council of Canada (NSERC) grant RGPIN-2020-04178.","author":[{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian"},{"last_name":"Ellen","first_name":"Faith","full_name":"Ellen, Faith"},{"id":"2e711909-896a-11ed-bdf8-eb0f5a2984c6","last_name":"Fedorov","first_name":"Alexander","full_name":"Fedorov, Alexander"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version"},{"editor":[{"full_name":"Passban, Peyman","first_name":"Peyman","last_name":"Passban"},{"full_name":"Way, Andy","first_name":"Andy","last_name":"Way"},{"full_name":"Rezagholizadeh, Mehdi","first_name":"Mehdi","last_name":"Rezagholizadeh"}],"alternative_title":["Machine Translation: Technologies and Applications"],"publication_status":"published","type":"book_chapter","page":"83-97","department":[{"_id":"DaAl"},{"_id":"GradSch"}],"day":"05","date_published":"2025-07-05T00:00:00Z","date_updated":"2026-02-19T09:26:54Z","acknowledgement":"We would like to thank Eugenia Iofinova for useful comments on an earlier version of this draft, and Artur Niederfahrenhorst for useful suggestions regarding fine-tuning on the GSM8k dataset.","corr_author":"1","publication":"Enhancing LLM Performance. 
Efficacy, Fine-Tuning, and Inference Techniques","arxiv":1,"oa_version":"Preprint","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2310.06927"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","publication_identifier":{"isbn":["9783031857461"],"eissn":["2522-803X"],"eisbn":["9783031857478"],"issn":["2522-8021"]},"author":[{"id":"47beb3a5-07b5-11eb-9b87-b108ec578218","last_name":"Kurtic","first_name":"Eldar","full_name":"Kurtic, Eldar"},{"full_name":"Kuznedelev, Denis","first_name":"Denis","last_name":"Kuznedelev"},{"last_name":"Frantar","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","first_name":"Elias","full_name":"Frantar, Elias"},{"full_name":"Goinv, Michael","first_name":"Michael","last_name":"Goinv"},{"first_name":"Shubhra","full_name":"Pandit, Shubhra","last_name":"Pandit"},{"first_name":"Abhinav","full_name":"Agarwalla, Abhinav","last_name":"Agarwalla"},{"first_name":"Tuan","full_name":"Nguyen, Tuan","last_name":"Nguyen"},{"last_name":"Marques","first_name":"Alexandre","full_name":"Marques, Alexandre"},{"last_name":"Kurtz","full_name":"Kurtz, Mark","first_name":"Mark"},{"first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X"}],"OA_place":"repository","doi":"10.1007/978-3-031-85747-8_6","language":[{"iso":"eng"}],"citation":{"ieee":"E. Kurtic <i>et al.</i>, “Sparse Fine-Tuning for Inference Acceleration of Large Language Models,” in <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, P. Passban, A. Way, and M. Rezagholizadeh, Eds. Springer Nature, 2025, pp. 83–97.","mla":"Kurtic, Eldar, et al. “Sparse Fine-Tuning for Inference Acceleration of Large Language Models.” <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, edited by Peyman Passban et al., Springer Nature, 2025, pp. 
83–97, doi:<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">10.1007/978-3-031-85747-8_6</a>.","ama":"Kurtic E, Kuznedelev D, Frantar E, et al. Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In: Passban P, Way A, Rezagholizadeh M, eds. <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>. Springer Nature; 2025:83-97. doi:<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">10.1007/978-3-031-85747-8_6</a>","apa":"Kurtic, E., Kuznedelev, D., Frantar, E., Goinv, M., Pandit, S., Agarwalla, A., … Alistarh, D.-A. (2025). Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In P. Passban, A. Way, &#38; M. Rezagholizadeh (Eds.), <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i> (pp. 83–97). Springer Nature. <a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">https://doi.org/10.1007/978-3-031-85747-8_6</a>","short":"E. Kurtic, D. Kuznedelev, E. Frantar, M. Goinv, S. Pandit, A. Agarwalla, T. Nguyen, A. Marques, M. Kurtz, D.-A. Alistarh, in:, P. Passban, A. Way, M. Rezagholizadeh (Eds.), Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques, Springer Nature, 2025, pp. 83–97.","ista":"Kurtic E, Kuznedelev D, Frantar E, Goinv M, Pandit S, Agarwalla A, Nguyen T, Marques A, Kurtz M, Alistarh D-A. 2025.Sparse Fine-Tuning for Inference Acceleration of Large Language Models. In: Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques. Machine Translation: Technologies and Applications, , 83–97.","chicago":"Kurtic, Eldar, Denis Kuznedelev, Elias Frantar, Michael Goinv, Shubhra Pandit, Abhinav Agarwalla, Tuan Nguyen, Alexandre Marques, Mark Kurtz, and Dan-Adrian Alistarh. “Sparse Fine-Tuning for Inference Acceleration of Large Language Models.” In <i>Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques</i>, edited by Peyman Passban, Andy Way, and Mehdi Rezagholizadeh, 83–97. Springer Nature, 2025. 
<a href=\"https://doi.org/10.1007/978-3-031-85747-8_6\">https://doi.org/10.1007/978-3-031-85747-8_6</a>."},"month":"07","date_created":"2026-02-16T15:57:53Z","status":"public","OA_type":"green","external_id":{"arxiv":["2310.06927"]},"oa":1,"article_processing_charge":"No","_id":"21257","quality_controlled":"1","year":"2025","abstract":[{"lang":"eng","text":"We investigate the problem of accurate sparse fine-tuning of large language models (LLMs), that is, fine-tuning pre-trained LLMs on specialized tasks, while inducing sparsity in their weights. Our work is motivated by experiments showing that standard loss-based fine-tuning methods are not able to achieve high accuracy in this setting, especially at high sparsity targets. To address this issue, we perform a detailed study of knowledge distillation losses for fine-tuning of sparse models. We determine an L2-based distillation approach that we term ‘SquareHead’, which enables accurate recovery even at higher sparsities. Investigating the question of efficient inference, we show that sparse LLMs can be executed faster by taking advantage of sparsity. Specifically, we exhibit end-to-end results showing speedups enabled by sparsity, while recovering accuracy, on the following models and tasks, respectively: T5 for language translation, Whisper for speech translation, and open GPT-type models such as the Mosaic Pre-Trained Transformer (MPT) and Llama-2 models for text generation. In particular, for popular generative tasks, we show for the first time that sparse fine-tuning can reach 75% sparsity without drops in accuracy, and provide notable end-to-end speedups for inference on CPUs. 
Moreover, we also highlight that sparsity is compatible with other compression approaches, such as quantization."}],"title":"Sparse Fine-Tuning for Inference Acceleration of Large Language Models","publisher":"Springer Nature"},{"abstract":[{"lang":"eng","text":"Distributed optimization is the standard way of speeding up machine learning training, and most of the research in the area focuses on distributed first-order, gradient-based methods. Yet, there are settings where some computationally-bounded nodes may not be able to implement first-order, gradient-based optimization, while they could still contribute to joint optimization tasks. In this paper, we initiate the study of hybrid decentralized optimization, studying settings where nodes with zeroth-order and first-order optimization capabilities co-exist in a distributed system, and attempt to jointly solve an optimization task over some data distribution. We essentially show that, under reasonable parameter settings, such a system can not only withstand noisier zeroth-order agents but can even benefit from integrating such agents into the optimization process, rather than ignoring their information. At the core of our approach is a new analysis of distributed optimization with noisy and possibly-biased gradient estimators, which may be of independent interest. Our results hold for both convex and non-convex objectives. 
Experimental results on standard optimization tasks confirm our analysis, showing that hybrid first-zeroth order optimization can be practical, even when training deep neural networks."}],"title":"Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence","publisher":"Association for the Advancement of Artificial Intelligence","ec_funded":1,"year":"2025","scopus_import":"1","OA_type":"free access","external_id":{"arxiv":["2210.07703"]},"oa":1,"_id":"19713","article_processing_charge":"No","quality_controlled":"1","doi":"10.1609/aaai.v39i19.34290","OA_place":"publisher","volume":39,"language":[{"iso":"eng"}],"month":"04","citation":{"apa":"Talaei, S., Ansaripour, M., Nadiradze, G., &#38; Alistarh, D.-A. (2025). Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. Association for the Advancement of Artificial Intelligence. <a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">https://doi.org/10.1609/aaai.v39i19.34290</a>","mla":"Talaei, Shayan, et al. “Hybrid Decentralized Optimization: Leveraging Both First- and Zeroth-Order Optimizers for Faster Convergence.” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>, vol. 39, no. 19, Association for the Advancement of Artificial Intelligence, 2025, pp. 20778–86, doi:<a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">10.1609/aaai.v39i19.34290</a>.","ieee":"S. Talaei, M. Ansaripour, G. Nadiradze, and D.-A. Alistarh, “Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence,” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>, vol. 39, no. 19. Association for the Advancement of Artificial Intelligence, pp. 20778–20786, 2025.","ama":"Talaei S, Ansaripour M, Nadiradze G, Alistarh D-A. 
Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. 2025;39(19):20778-20786. doi:<a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">10.1609/aaai.v39i19.34290</a>","chicago":"Talaei, Shayan, Matin Ansaripour, Giorgi Nadiradze, and Dan-Adrian Alistarh. “Hybrid Decentralized Optimization: Leveraging Both First- and Zeroth-Order Optimizers for Faster Convergence.” <i>Proceedings of the 39th AAAI Conference on Artificial Intelligence</i>. Association for the Advancement of Artificial Intelligence, 2025. <a href=\"https://doi.org/10.1609/aaai.v39i19.34290\">https://doi.org/10.1609/aaai.v39i19.34290</a>.","ista":"Talaei S, Ansaripour M, Nadiradze G, Alistarh D-A. 2025. Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence. Proceedings of the 39th AAAI Conference on Artificial Intelligence. 39(19), 20778–20786.","short":"S. Talaei, M. Ansaripour, G. Nadiradze, D.-A. 
Alistarh, Proceedings of the 39th AAAI Conference on Artificial Intelligence 39 (2025) 20778–20786."},"issue":"19","status":"public","project":[{"name":"Elastic Coordination for Scalable Machine Learning","_id":"268A44D6-B435-11E9-9278-68D0E5697425","grant_number":"805223","call_identifier":"H2020"}],"date_created":"2025-05-19T14:15:35Z","oa_version":"Preprint","arxiv":1,"main_file_link":[{"open_access":"1","url":"https://doi.org/10.1609/aaai.v39i19.34290"}],"article_type":"original","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"last_name":"Talaei","full_name":"Talaei, Shayan","first_name":"Shayan"},{"last_name":"Ansaripour","first_name":"Matin","full_name":"Ansaripour, Matin"},{"first_name":"Giorgi","full_name":"Nadiradze, Giorgi","id":"3279A00C-F248-11E8-B48F-1D18A9856A87","last_name":"Nadiradze","orcid":"0000-0001-5634-0731"},{"full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"publication_identifier":{"eissn":["2374-3468"],"issn":["2159-5399"]},"acknowledgement":"This project has received funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement\r\nNo 805223 ScaleML). The authors would like to acknowledge Eugenia Iofinova for useful discussions during the inception of this project.","date_updated":"2026-02-16T12:34:44Z","corr_author":"1","publication":"Proceedings of the 39th AAAI Conference on Artificial Intelligence","related_material":{"link":[{"relation":"software","url":"https://github.com/ShayanTalaei/HDO"}]},"department":[{"_id":"DaAl"}],"type":"journal_article","page":"20778-20786","day":"11","date_published":"2025-04-11T00:00:00Z","intvolume":"        39","publication_status":"published"},{"OA_place":"publisher","doi":"10.1145/3710848.3710871","language":[{"iso":"eng"}],"month":"02","citation":{"apa":"Frantar, E., Castro, R. 
L., Chen, J., Hoefler, T., &#38; Alistarh, D.-A. (2025). MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. In <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i> (pp. 239–251). Las Vegas, NV, United States: Association for Computing Machinery. <a href=\"https://doi.org/10.1145/3710848.3710871\">https://doi.org/10.1145/3710848.3710871</a>","ieee":"E. Frantar, R. L. Castro, J. Chen, T. Hoefler, and D.-A. Alistarh, “MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models,” in <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, Las Vegas, NV, United States, 2025, pp. 239–251.","mla":"Frantar, Elias, et al. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, Association for Computing Machinery, 2025, pp. 239–51, doi:<a href=\"https://doi.org/10.1145/3710848.3710871\">10.1145/3710848.3710871</a>.","ama":"Frantar E, Castro RL, Chen J, Hoefler T, Alistarh D-A. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. In: <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>. Association for Computing Machinery; 2025:239-251. doi:<a href=\"https://doi.org/10.1145/3710848.3710871\">10.1145/3710848.3710871</a>","ista":"Frantar E, Castro RL, Chen J, Hoefler T, Alistarh D-A. 2025. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming. PPoPP: Symposium on Principles and Practice of Parallel Programming, 239–251.","chicago":"Frantar, Elias, Roberto L. Castro, Jiale Chen, Torsten Hoefler, and Dan-Adrian Alistarh. 
“MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” In <i>Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming</i>, 239–51. Association for Computing Machinery, 2025. <a href=\"https://doi.org/10.1145/3710848.3710871\">https://doi.org/10.1145/3710848.3710871</a>.","short":"E. Frantar, R.L. Castro, J. Chen, T. Hoefler, D.-A. Alistarh, in:, Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, Association for Computing Machinery, 2025, pp. 239–251."},"status":"public","date_created":"2025-06-23T13:51:58Z","conference":{"start_date":"2025-03-01","name":"PPoPP: Symposium on Principles and Practice of Parallel Programming","end_date":"2025-03-05","location":"Las Vegas, NV, United States"},"OA_type":"hybrid","external_id":{"isi":["001437826500019"],"arxiv":["2408.11743"]},"oa":1,"_id":"19877","article_processing_charge":"Yes (via OA deal)","quality_controlled":"1","ddc":["000"],"year":"2025","has_accepted_license":"1","scopus_import":"1","abstract":[{"lang":"eng","text":"As inference on Large Language Models (LLMs) emerges as an important workload in machine learning applications, model weight quantization has become a standard technique for efficient GPU deployment. Quantization not only reduces model size, but has also been shown to yield substantial speedups for single-user inference, due to reduced memory movement, with low accuracy impact. Yet, it remains a key open question whether speedups are achievable also in batched settings with multiple parallel clients, which are highly relevant for practical serving. It is unclear whether GPU kernels can be designed to remain practically memory-bound, while supporting the substantially increased compute requirements of batched workloads.\r\nIn this paper, we resolve this question positively by introducing a new design for Mixed-precision Auto-Regressive LINear kernels, called MARLIN. 
Concretely, given a model whose weights are compressed via quantization to, e.g., 4 bits per element, MARLIN shows that batchsizes up to 16-32 can be practically supported with close to maximum (4×) quantization speedup, and larger batchsizes up to 64-128 with gradually decreasing, but still significant, acceleration. MARLIN accomplishes this via a combination of techniques, such as asynchronous memory access, complex task scheduling and pipelining, and bespoke quantization support. Our experiments show that MARLIN's near-optimal performance on individual LLM layers across different scenarios can also lead to significant end-to-end LLM inference speedups (of up to 2.8×) when integrated with the popular vLLM open-source serving engine. Finally, we show that MARLIN is extensible to further compression techniques, like NVIDIA 2:4 sparsity, leading to additional speedups."}],"tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models","publisher":"Association for Computing Machinery","file":[{"file_name":"2025_PPoPP_Frantar.pdf","file_size":1330044,"access_level":"open_access","file_id":"19883","content_type":"application/pdf","creator":"dernst","checksum":"a0566ea3c168e8273501a5eb7d767cf8","success":1,"date_created":"2025-06-24T06:04:17Z","date_updated":"2025-06-24T06:04:17Z","relation":"main_file"}],"publication_status":"published","department":[{"_id":"DaAl"}],"page":"239-251","isi":1,"type":"conference","day":"28","file_date_updated":"2025-06-24T06:04:17Z","date_published":"2025-02-28T00:00:00Z","acknowledgement":"The authors would like to thank the Neural Magic team, in particular Michael Goin, Alexander Matveev, and Rob Shaw, for support with the vLLM integration. 
This research was supported in part by generous grants from NVIDIA and Google.","date_updated":"2025-09-30T13:41:57Z","corr_author":"1","publication":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","related_material":{"record":[{"relation":"software","status":"public","id":"19884"}]},"arxiv":1,"oa_version":"Published Version","user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","author":[{"full_name":"Frantar, Elias","first_name":"Elias","last_name":"Frantar","id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f"},{"last_name":"Castro","first_name":"Roberto L.","full_name":"Castro, Roberto L."},{"orcid":"0000-0001-5337-5875","last_name":"Chen","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","first_name":"Jiale","full_name":"Chen, Jiale"},{"first_name":"Torsten","full_name":"Hoefler, Torsten","last_name":"Hoefler"},{"full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X"}],"publication_identifier":{"isbn":["9798400714436"]}},{"publication_identifier":{"eissn":["1432-0452"],"issn":["0178-2770"]},"author":[{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"},{"orcid":"0000-0002-6432-6646","last_name":"Rybicki","id":"334EFD2E-F248-11E8-B48F-1D18A9856A87","first_name":"Joel","full_name":"Rybicki, Joel"},{"last_name":"Voitovych","full_name":"Voitovych, Sasha","first_name":"Sasha"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","article_type":"original","oa_version":"Published Version","arxiv":1,"related_material":{"record":[{"relation":"earlier_version","status":"public","id":"11844"}]},"publication":"Distributed Computing","corr_author":"1","acknowledgement":"We thank all anonymous reviewers for their helpful comments. We would also like to thank Jakob Solnerzik and Olivier Stietel for catching some errors in the proofs. 
Open Access funding enabled and organized by Projekt DEAL. We gratefully acknowledge funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 805223 ScaleML).","date_updated":"2025-12-30T09:04:18Z","date_published":"2025-09-01T00:00:00Z","file_date_updated":"2025-12-30T09:03:55Z","day":"01","PlanS_conform":"1","department":[{"_id":"DaAl"}],"page":"207-245","type":"journal_article","isi":1,"publication_status":"published","file":[{"date_created":"2025-12-30T09:03:55Z","success":1,"date_updated":"2025-12-30T09:03:55Z","relation":"main_file","file_name":"2025_DistributedComp_Alistarh.pdf","file_size":770705,"access_level":"open_access","creator":"dernst","checksum":"2789c0fdfb58f64930f05f6ac2b3ca61","file_id":"20900","content_type":"application/pdf"}],"intvolume":"        38","publisher":"Springer Nature","title":"Near-optimal leader election in population protocols on graphs","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"abstract":[{"lang":"eng","text":"In the stochastic population protocol model, we are given a connected graph with n nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1)\r\nexactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows. On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in Θ(n log n) expected steps using Θ(log log n) states, whereas protocols that use O(1) states require Ω(n²) expected steps. 
In this work, we investigate the complexity of stable leader election on graphs. We provide the first non-trivial time lower bounds on general graphs, showing that, when moving beyond cliques, the complexity of stable leader election can range from O(1) to Θ(n³) expected steps. We describe a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only O(log² n) states that is at most a factor O(log n) slower. Finally, we observe that for many graphs the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor O(n log n) slower than the fast polynomial-state protocol, and among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs."}],"ec_funded":1,"has_accepted_license":"1","year":"2025","scopus_import":"1","ddc":["510"],"quality_controlled":"1","_id":"19969","article_processing_charge":"Yes (via OA deal)","oa":1,"external_id":{"isi":["001518300400001"],"arxiv":["2205.12597"]},"OA_type":"hybrid","status":"public","project":[{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","call_identifier":"H2020","grant_number":"805223","name":"Elastic Coordination for Scalable Machine Learning"}],"date_created":"2025-07-06T22:01:24Z","month":"09","citation":{"short":"D.-A. Alistarh, J. Rybicki, S. Voitovych, Distributed Computing 38 (2025) 207–245.","ista":"Alistarh D-A, Rybicki J, Voitovych S. 2025. Near-optimal leader election in population protocols on graphs. Distributed Computing. 38, 207–245.","chicago":"Alistarh, Dan-Adrian, Joel Rybicki, and Sasha Voitovych. “Near-Optimal Leader Election in Population Protocols on Graphs.” <i>Distributed Computing</i>. Springer Nature, 2025. <a href=\"https://doi.org/10.1007/s00446-025-00487-7\">https://doi.org/10.1007/s00446-025-00487-7</a>.","ieee":"D.-A. Alistarh, J. Rybicki, and S. 
Voitovych, “Near-optimal leader election in population protocols on graphs,” <i>Distributed Computing</i>, vol. 38. Springer Nature, pp. 207–245, 2025.","mla":"Alistarh, Dan-Adrian, et al. “Near-Optimal Leader Election in Population Protocols on Graphs.” <i>Distributed Computing</i>, vol. 38, Springer Nature, 2025, pp. 207–45, doi:<a href=\"https://doi.org/10.1007/s00446-025-00487-7\">10.1007/s00446-025-00487-7</a>.","ama":"Alistarh D-A, Rybicki J, Voitovych S. Near-optimal leader election in population protocols on graphs. <i>Distributed Computing</i>. 2025;38:207-245. doi:<a href=\"https://doi.org/10.1007/s00446-025-00487-7\">10.1007/s00446-025-00487-7</a>","apa":"Alistarh, D.-A., Rybicki, J., &#38; Voitovych, S. (2025). Near-optimal leader election in population protocols on graphs. <i>Distributed Computing</i>. Springer Nature. <a href=\"https://doi.org/10.1007/s00446-025-00487-7\">https://doi.org/10.1007/s00446-025-00487-7</a>"},"volume":38,"language":[{"iso":"eng"}],"OA_place":"publisher","doi":"10.1007/s00446-025-00487-7"},{"arxiv":1,"oa_version":"Published Version","author":[{"full_name":"Chen, Jiale","first_name":"Jiale","orcid":"0000-0001-5337-5875","last_name":"Chen","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785"},{"last_name":"Yao","id":"d3e02e50-48a8-11ee-8f62-c108061797fa","full_name":"Yao, Dingling","first_name":"Dingling"},{"last_name":"Pervez","id":"fca6d90c-d47f-11ee-bc87-93ff51604981","first_name":"Adeel A","full_name":"Pervez, Adeel A"},{"last_name":"Alistarh","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian"},{"last_name":"Locatello","orcid":"0000-0002-4850-0683","id":"26cfd52f-2483-11ee-8040-88983bcc06d4","first_name":"Francesco","full_name":"Locatello, 
Francesco"}],"publication_identifier":{"isbn":["9798331320850"]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","corr_author":"1","date_updated":"2025-08-04T08:03:11Z","related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/ScalableMNN"}]},"publication":"13th International Conference on Learning Representations","file_date_updated":"2025-07-22T07:58:22Z","day":"01","department":[{"_id":"DaAl"},{"_id":"FrLo"}],"type":"conference","page":"63716-63737","date_published":"2025-04-01T00:00:00Z","publication_status":"published","file":[{"date_created":"2025-07-22T07:58:22Z","success":1,"relation":"main_file","date_updated":"2025-07-22T07:58:22Z","access_level":"open_access","file_size":732745,"file_name":"2025_ICLR_Chen.pdf","file_id":"20065","content_type":"application/pdf","checksum":"64cfdb12ae3e4e8ba57b1403e1066776","creator":"dernst"}],"abstract":[{"lang":"eng","text":"We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. 
Source code is available at https://github.com/IST-DASLab/ScalableMNN."}],"publisher":"ICLR","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"Scalable mechanistic neural networks","year":"2025","has_accepted_license":"1","scopus_import":"1","oa":1,"conference":{"location":"Singapore, Singapore","start_date":"2025-04-24","name":"ICLR: International Conference on Learning Representations","end_date":"2025-04-28"},"external_id":{"arxiv":["2410.06074"]},"OA_type":"diamond","ddc":["000"],"_id":"20032","quality_controlled":"1","article_processing_charge":"No","language":[{"iso":"eng"}],"OA_place":"publisher","status":"public","date_created":"2025-07-20T22:02:01Z","month":"04","citation":{"ama":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. Scalable mechanistic neural networks. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:63716-63737.","mla":"Chen, Jiale, et al. “Scalable Mechanistic Neural Networks.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 63716–37.","ieee":"J. Chen, D. Yao, A. A. Pervez, D.-A. Alistarh, and F. Locatello, “Scalable mechanistic neural networks,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 63716–63737.","apa":"Chen, J., Yao, D., Pervez, A. A., Alistarh, D.-A., &#38; Locatello, F. (2025). Scalable mechanistic neural networks. In <i>13th International Conference on Learning Representations</i> (pp. 63716–63737). Singapore, Singapore: ICLR.","short":"J. Chen, D. Yao, A.A. Pervez, D.-A. Alistarh, F. Locatello, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 63716–63737.","chicago":"Chen, Jiale, Dingling Yao, Adeel A Pervez, Dan-Adrian Alistarh, and Francesco Locatello. 
“Scalable Mechanistic Neural Networks.” In <i>13th International Conference on Learning Representations</i>, 63716–37. ICLR, 2025.","ista":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. 2025. Scalable mechanistic neural networks. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 63716–63737."}},{"ddc":["000"],"_id":"20034","quality_controlled":"1","article_processing_charge":"No","oa":1,"conference":{"start_date":"2025-04-24","name":"ICLR: International Conference on Learning Representations","end_date":"2025-04-28","location":"Singapore, Singapore"},"external_id":{"arxiv":["2410.16103"]},"OA_type":"diamond","status":"public","date_created":"2025-07-20T22:02:02Z","month":"04","citation":{"apa":"Robert, T., Safaryan, M., Modoranu, I.-V., &#38; Alistarh, D.-A. (2025). LDAdam: Adaptive optimization from low-dimensional gradient statistics. In <i>13th International Conference on Learning Representations</i> (pp. 101877–101913). Singapore, Singapore: ICLR.","ama":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. LDAdam: Adaptive optimization from low-dimensional gradient statistics. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:101877-101913.","ieee":"T. Robert, M. Safaryan, I.-V. Modoranu, and D.-A. Alistarh, “LDAdam: Adaptive optimization from low-dimensional gradient statistics,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 101877–101913.","mla":"Robert, Thomas, et al. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 101877–913.","ista":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. 2025. LDAdam: Adaptive optimization from low-dimensional gradient statistics. 13th International Conference on Learning Representations. 
ICLR: International Conference on Learning Representations, 101877–101913.","chicago":"Robert, Thomas, Mher Safaryan, Ionut-Vlad Modoranu, and Dan-Adrian Alistarh. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” In <i>13th International Conference on Learning Representations</i>, 101877–913. ICLR, 2025.","short":"T. Robert, M. Safaryan, I.-V. Modoranu, D.-A. Alistarh, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 101877–101913."},"language":[{"iso":"eng"}],"OA_place":"publisher","publisher":"ICLR","title":"LDAdam: Adaptive optimization from low-dimensional gradient statistics","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"abstract":[{"lang":"eng","text":"We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. 
We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models."}],"year":"2025","has_accepted_license":"1","scopus_import":"1","date_published":"2025-04-01T00:00:00Z","day":"01","file_date_updated":"2025-08-04T08:39:51Z","department":[{"_id":"DaAl"}],"page":"101877-101913","type":"conference","publication_status":"published","file":[{"creator":"dernst","checksum":"9327d82569358d7bf1c3ec1a9952e721","content_type":"application/pdf","file_id":"20113","file_name":"2025_ICLR_Robert.pdf","file_size":1346111,"access_level":"open_access","date_updated":"2025-08-04T08:39:51Z","relation":"main_file","success":1,"date_created":"2025-08-04T08:39:51Z"}],"publication_identifier":{"isbn":["9798331320850"]},"author":[{"last_name":"Robert","first_name":"Thomas","full_name":"Robert, Thomas"},{"id":"dd546b39-0804-11ed-9c55-ef075c39778d","last_name":"Safaryan","full_name":"Safaryan, Mher","first_name":"Mher"},{"last_name":"Modoranu","id":"449f7a18-f128-11eb-9611-9b430c0c6333","first_name":"Ionut-Vlad","full_name":"Modoranu, Ionut-Vlad"},{"full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","arxiv":1,"oa_version":"Published Version","related_material":{"link":[{"url":"https://github.com/IST-DASLab/LDAdam","relation":"software"}]},"publication":"13th International Conference on Learning 
Representations","corr_author":"1","date_updated":"2025-08-04T08:41:10Z"},{"file":[{"file_name":"2025_ICLR_Sawmya.pdf","access_level":"open_access","file_size":5447177,"file_id":"20110","content_type":"application/pdf","creator":"dernst","checksum":"39a8fa7dbdd7029859e156f53f20f6bc","success":1,"date_created":"2025-08-04T08:14:09Z","date_updated":"2025-08-04T08:14:09Z","relation":"main_file"}],"publication_status":"published","department":[{"_id":"DaAl"}],"page":"26244-26274","type":"conference","file_date_updated":"2025-08-04T08:14:09Z","day":"01","date_published":"2025-04-01T00:00:00Z","acknowledgement":"The authors would like to extend their gratitude to Lori Leu for her insightful comments on the\r\napplication of the Wasserstein distance metric. We also wish to thank Elias Frantar for his help in\r\nworking with the SparseGPT implementation and his advice for the project. Additionally, we would like to thank Tony Tong Wang and Thomas Athey for their valuable feedback and constructive discussions.\r\nThis work was supported by an NIH Brains CONNECTS U01 grant and AMD’s AI & HPC Fund.","date_updated":"2025-08-04T08:16:43Z","corr_author":"1","publication":"13th International Conference on Learning Representations","related_material":{"link":[{"relation":"software","url":"https://github.com/Shavit-Lab/Sparse-Expansion"}]},"arxiv":1,"oa_version":"Published Version","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"full_name":"Sawmya, Shashata","first_name":"Shashata","last_name":"Sawmya"},{"last_name":"Kong","first_name":"Linghao","full_name":"Kong, Linghao"},{"full_name":"Markov, Ilia","first_name":"Ilia","last_name":"Markov","id":"D0CF4148-C985-11E9-8066-0BDEE5697425"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian"},{"full_name":"Shavit, 
Nir","first_name":"Nir","last_name":"Shavit"}],"publication_identifier":{"isbn":["9798331320850"]},"OA_place":"publisher","language":[{"iso":"eng"}],"month":"04","citation":{"short":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, N. Shavit, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 26244–26274.","chicago":"Sawmya, Shashata, Linghao Kong, Ilia Markov, Dan-Adrian Alistarh, and Nir Shavit. “Wasserstein Distances, Neuronal Entanglement, and Sparsity.” In <i>13th International Conference on Learning Representations</i>, 26244–74. ICLR, 2025.","ista":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. 2025. Wasserstein distances, neuronal entanglement, and sparsity. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 26244–26274.","ama":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. Wasserstein distances, neuronal entanglement, and sparsity. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:26244-26274.","ieee":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, and N. Shavit, “Wasserstein distances, neuronal entanglement, and sparsity,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 26244–26274.","mla":"Sawmya, Shashata, et al. “Wasserstein Distances, Neuronal Entanglement, and Sparsity.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 26244–74.","apa":"Sawmya, S., Kong, L., Markov, I., Alistarh, D.-A., &#38; Shavit, N. (2025). Wasserstein distances, neuronal entanglement, and sparsity. In <i>13th International Conference on Learning Representations</i> (pp. 26244–26274). 
Singapore, Singapore: ICLR."},"status":"public","date_created":"2025-07-20T22:02:03Z","conference":{"name":"ICLR: International Conference on Learning Representations","start_date":"2025-04-24","end_date":"2025-04-28","location":"Singapore, Singapore"},"OA_type":"diamond","external_id":{"arxiv":["2405.15756"]},"oa":1,"_id":"20037","article_processing_charge":"No","quality_controlled":"1","ddc":["000"],"has_accepted_license":"1","year":"2025","scopus_import":"1","abstract":[{"lang":"eng","text":"Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled \"Wasserstein Neurons\" in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. 
We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons."}],"tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"Wasserstein distances, neuronal entanglement, and sparsity","publisher":"ICLR"},{"has_accepted_license":"1","year":"2025","scopus_import":"1","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws","publisher":"ICLR","abstract":[{"lang":"eng","text":"Pruning eliminates unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training--which combines pruning and pre-training into a single phase--provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25% of total training compute and concluding at 75% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. 
Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference."}],"month":"04","citation":{"apa":"Jin, T., Humayun, A. I., Evci, U., Subramanian, S., Yazdanbakhsh, A., Alistarh, D.-A., &#38; Dziugaite, G. K. (2025). The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In <i>13th International Conference on Learning Representations</i> (pp. 85165–85181). Singapore, Singapore: ICLR.","ieee":"T. Jin <i>et al.</i>, “The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 85165–85181.","mla":"Jin, Tian, et al. “The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 85165–81.","ama":"Jin T, Humayun AI, Evci U, et al. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:85165-85181.","chicago":"Jin, Tian, Ahmed Imtiaz Humayun, Utku Evci, Suvinay Subramanian, Amir Yazdanbakhsh, Dan-Adrian Alistarh, and Gintare Karolina Dziugaite. “The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” In <i>13th International Conference on Learning Representations</i>, 85165–81. 
ICLR, 2025.","ista":"Jin T, Humayun AI, Evci U, Subramanian S, Yazdanbakhsh A, Alistarh D-A, Dziugaite GK. 2025. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 85165–85181.","short":"T. Jin, A.I. Humayun, U. Evci, S. Subramanian, A. Yazdanbakhsh, D.-A. Alistarh, G.K. Dziugaite, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 85165–85181."},"status":"public","date_created":"2025-07-20T22:02:03Z","OA_place":"publisher","language":[{"iso":"eng"}],"article_processing_charge":"No","_id":"20038","quality_controlled":"1","ddc":["000"],"conference":{"location":"Singapore, Singapore","end_date":"2025-04-28","name":"ICLR: International Conference on Learning Representations","start_date":"2025-04-24"},"external_id":{"arxiv":["2501.12486"]},"OA_type":"diamond","oa":1,"publication":"13th International Conference on Learning Representations","acknowledgement":"We are deeply grateful to Elias Frantar, Naveen Kumar, Sanjiv Kumar, Daniel\r\nM. Roy, and Clemens Schaefer for their valuable feedback and thoughtful review of this paper.\r\nWe also acknowledge the critical support provided by the Google CoreML Performance Team, and Google Research during this project. 
We further recognize the extended team at Google DeepMind, who enabled and supported this research direction.\r\nThis work was in part supported by the Sloan Foundation, the MIT-IBM Watson AI Lab, Apple, and SRC JUMP 2.0 (CoCoSys).","date_updated":"2025-08-04T08:24:59Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"first_name":"Tian","full_name":"Jin, Tian","last_name":"Jin"},{"first_name":"Ahmed Imtiaz","full_name":"Humayun, Ahmed Imtiaz","last_name":"Humayun"},{"full_name":"Evci, Utku","first_name":"Utku","last_name":"Evci"},{"first_name":"Suvinay","full_name":"Subramanian, Suvinay","last_name":"Subramanian"},{"last_name":"Yazdanbakhsh","full_name":"Yazdanbakhsh, Amir","first_name":"Amir"},{"full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh"},{"full_name":"Dziugaite, Gintare Karolina","first_name":"Gintare Karolina","last_name":"Dziugaite"}],"publication_identifier":{"isbn":["9798331320850"]},"arxiv":1,"oa_version":"Published Version","file":[{"relation":"main_file","date_updated":"2025-08-04T08:23:47Z","success":1,"date_created":"2025-08-04T08:23:47Z","creator":"dernst","checksum":"dbc27120e9aba67dffbd9e5d513a6803","file_id":"20111","content_type":"application/pdf","file_size":704989,"access_level":"open_access","file_name":"2025_ICLR_Jin.pdf"}],"publication_status":"published","date_published":"2025-04-01T00:00:00Z","department":[{"_id":"DaAl"}],"page":"85165-85181","type":"conference","day":"01","file_date_updated":"2025-08-04T08:23:47Z"},{"author":[{"last_name":"Martynov","first_name":"Pavel","full_name":"Martynov, Pavel"},{"full_name":"Buzdalov, Maxim","first_name":"Maxim","last_name":"Buzdalov"},{"full_name":"Pankratov, Sergei","first_name":"Sergei","id":"f773bf05-72ef-11ef-b75a-a383d22f454b","last_name":"Pankratov"},{"full_name":"Aksenov, 
Vitaliy","first_name":"Vitaliy","last_name":"Aksenov"},{"last_name":"Schmid","first_name":"Stefan","full_name":"Schmid, Stefan"}],"publication_identifier":{"isbn":["9798400714658"]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version","publication":"Proceedings of the 2025 Genetic and Evolutionary Computation Conference","date_updated":"2025-12-01T12:35:24Z","acknowledgement":"Research was supported by the German Research Foundation (DFG), grant 470029389 (FlexNets).","date_published":"2025-07-13T00:00:00Z","day":"13","file_date_updated":"2025-09-02T07:41:13Z","page":"249-257","isi":1,"type":"conference","department":[{"_id":"DaAl"}],"publication_status":"published","file":[{"success":1,"date_created":"2025-09-02T07:41:13Z","date_updated":"2025-09-02T07:41:13Z","relation":"main_file","file_name":"2025_GECCO_Martynov.pdf","access_level":"open_access","file_size":608996,"checksum":"7e513fa508cff7e8a0d33f50b1fe09af","creator":"dernst","file_id":"20273","content_type":"application/pdf"}],"publisher":"Association for Computing Machinery","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"In the search of optimal tree networks: Hardness and heuristics","abstract":[{"text":"Traffic in datacenters may follow some pattern: some pairs of servers communicate more frequently than others. Demand-oblivious networks may perform poorly for such workloads, and demand-aware networks optimized for traffic should be used instead. Unfortunately, not all shapes of networks are feasible in real hardware. Practical limitations are usually provided in the form of a topology. For example, a network may be required to be a binary tree, a bounded-degree graph or a Fat tree.\r\nIn this work, we consider a topology of a binary tree, one of the most fundamental network topologies. 
We show that already finding an optimal demand-aware binary tree network is NP-hard. Then, we explore how various optimization techniques, including simple local searches, as well as deterministic mutation and crossover operators, cope with generating efficient tree networks on real-life and synthetic workloads.","lang":"eng"}],"scopus_import":"1","year":"2025","has_accepted_license":"1","ddc":["000"],"_id":"20224","quality_controlled":"1","article_processing_charge":"Yes (in subscription journal)","oa":1,"OA_type":"hybrid","external_id":{"isi":["001556459900031"]},"conference":{"location":"Malaga, Spain","end_date":"2025-07-18","start_date":"2025-07-14","name":"GECCO: Genetic and evolutionary computation conference"},"date_created":"2025-08-24T22:01:31Z","status":"public","citation":{"ista":"Martynov P, Buzdalov M, Pankratov S, Aksenov V, Schmid S. 2025. In the search of optimal tree networks: Hardness and heuristics. Proceedings of the 2025 Genetic and Evolutionary Computation Conference. GECCO: Genetic and evolutionary computation conference, 249–257.","chicago":"Martynov, Pavel, Maxim Buzdalov, Sergei Pankratov, Vitaliy Aksenov, and Stefan Schmid. “In the Search of Optimal Tree Networks: Hardness and Heuristics.” In <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, 249–57. Association for Computing Machinery, 2025. <a href=\"https://doi.org/10.1145/3712256.3726425\">https://doi.org/10.1145/3712256.3726425</a>.","short":"P. Martynov, M. Buzdalov, S. Pankratov, V. Aksenov, S. Schmid, in:, Proceedings of the 2025 Genetic and Evolutionary Computation Conference, Association for Computing Machinery, 2025, pp. 249–257.","apa":"Martynov, P., Buzdalov, M., Pankratov, S., Aksenov, V., &#38; Schmid, S. (2025). In the search of optimal tree networks: Hardness and heuristics. In <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i> (pp. 249–257). Malaga, Spain: Association for Computing Machinery. 
<a href=\"https://doi.org/10.1145/3712256.3726425\">https://doi.org/10.1145/3712256.3726425</a>","ama":"Martynov P, Buzdalov M, Pankratov S, Aksenov V, Schmid S. In the search of optimal tree networks: Hardness and heuristics. In: <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>. Association for Computing Machinery; 2025:249-257. doi:<a href=\"https://doi.org/10.1145/3712256.3726425\">10.1145/3712256.3726425</a>","ieee":"P. Martynov, M. Buzdalov, S. Pankratov, V. Aksenov, and S. Schmid, “In the search of optimal tree networks: Hardness and heuristics,” in <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, Malaga, Spain, 2025, pp. 249–257.","mla":"Martynov, Pavel, et al. “In the Search of Optimal Tree Networks: Hardness and Heuristics.” <i>Proceedings of the 2025 Genetic and Evolutionary Computation Conference</i>, Association for Computing Machinery, 2025, pp. 249–57, doi:<a href=\"https://doi.org/10.1145/3712256.3726425\">10.1145/3712256.3726425</a>."},"month":"07","language":[{"iso":"eng"}],"doi":"10.1145/3712256.3726425","OA_place":"publisher"},{"publisher":"Association for Computational Linguistics","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"“Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization","abstract":[{"lang":"eng","text":"Quantization is a powerful tool for accelerating large language model (LLM) inference, but the accuracy-performance trade-offs across different formats remain unclear. In this paper, we conduct the most comprehensive empirical study to date, evaluating FP8, INT8, and INT4\r\nquantization across academic benchmarks and real-world tasks on the entire Llama-3.1 model\r\nfamily. 
Through over 500,000 evaluations, our investigation yields several key findings: (1) FP8 (W8A8-FP) is effectively lossless across all model scales, (2) well-tuned INT8 (W8A8-INT) achieves surprisingly low (1-3%) accuracy degradation, and (3) INT4 weight-only (W4A16-INT) is more competitive than expected, rivaling 8-bit quantization. Further, we investigate the optimal quantization format for different deployments by analyzing inference performance through the popular vLLM framework. Our analysis provides clear deployment recommendations: W4A16 is the most cost-efficient for synchronous setups, while W8A8 dominates in asynchronous\r\ncontinuous batching. For mixed workloads, the optimal choice depends on the specific use\r\ncase. Our findings offer practical, data-driven guidelines for deploying quantized LLMs at scale—ensuring the best balance between speed, efficiency, and accuracy. "}],"scopus_import":"1","year":"2025","has_accepted_license":"1","ddc":["000"],"article_processing_charge":"No","_id":"20684","quality_controlled":"1","oa":1,"external_id":{"arxiv":["2411.02355"]},"OA_type":"gold","conference":{"start_date":"2025-07-27","name":"ACL: Meeting of the Association for Computational Linguistics","end_date":"2025-08-01","location":"Vienna, Austria"},"date_created":"2025-11-24T14:20:46Z","status":"public","citation":{"mla":"Kurtic, Eldar, et al. “‘Give Me BF16 or Give Me Death’? Accuracy-Performance Trade-Offs in LLM Quantization.” <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, Association for Computational Linguistics, 2025, pp. 26872–86.","ieee":"E. Kurtic, A. Marques, S. Pandit, M. Kurtz, and D.-A. Alistarh, “‘Give me BF16 or give me death’? Accuracy-performance trade-offs in LLM quantization,” in <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, Vienna, Austria, 2025, pp. 26872–26886.","ama":"Kurtic E, Marques A, Pandit S, Kurtz M, Alistarh D-A. 
“Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization. In: <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>. Association for Computational Linguistics; 2025:26872-26886.","apa":"Kurtic, E., Marques, A., Pandit, S., Kurtz, M., &#38; Alistarh, D.-A. (2025). “Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization. In <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i> (pp. 26872–26886). Vienna, Austria: Association for Computational Linguistics.","short":"E. Kurtic, A. Marques, S. Pandit, M. Kurtz, D.-A. Alistarh, in:, Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, 2025, pp. 26872–26886.","ista":"Kurtic E, Marques A, Pandit S, Kurtz M, Alistarh D-A. 2025. “Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization. Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics. ACL: Meeting of the Association for Computational Linguistics, 26872–26886.","chicago":"Kurtic, Eldar, Alexandre Marques, Shubhra Pandit, Mark Kurtz, and Dan-Adrian Alistarh. “‘Give Me BF16 or Give Me Death’? Accuracy-Performance Trade-Offs in LLM Quantization.” In <i>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</i>, 26872–86. 
Association for Computational Linguistics, 2025."},"month":"08","language":[{"iso":"eng"}],"OA_place":"publisher","author":[{"last_name":"Kurtic","id":"47beb3a5-07b5-11eb-9b87-b108ec578218","full_name":"Kurtic, Eldar","first_name":"Eldar"},{"first_name":"Alexandre","full_name":"Marques, Alexandre","last_name":"Marques"},{"last_name":"Pandit","first_name":"Shubhra","full_name":"Pandit, Shubhra"},{"full_name":"Kurtz, Mark","first_name":"Mark","last_name":"Kurtz"},{"first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"publication_identifier":{"issn":["0736-587X"],"isbn":["9798891762510"]},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa_version":"Published Version","arxiv":1,"publication":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics","corr_author":"1","date_updated":"2025-11-26T11:15:11Z","date_published":"2025-08-01T00:00:00Z","day":"01","file_date_updated":"2025-11-26T11:06:57Z","type":"conference","page":"26872-26886","department":[{"_id":"DaAl"}],"publication_status":"published","file":[{"file_size":417450,"access_level":"open_access","file_name":"2025_ACL_Kurtic.pdf","content_type":"application/pdf","file_id":"20698","checksum":"4c066ee20f9ab17619c95652c0eb75f1","creator":"dernst","success":1,"date_created":"2025-11-26T11:06:57Z","relation":"main_file","date_updated":"2025-11-26T11:06:57Z"}]},{"department":[{"_id":"BiCh"},{"_id":"DaAl"}],"page":"11427-11435","type":"journal_article","isi":1,"day":"31","date_published":"2025-10-31T00:00:00Z","intvolume":"        21","publication_status":"published","oa_version":"None","article_type":"original","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"id":"6e5644c0-c180-11ed-a2da-facc4c9f4f09","last_name":"Tuo","full_name":"Tuo, 
Ping","first_name":"Ping"},{"id":"54a2c730-803f-11ed-ab7e-95b29d2680e7","orcid":"0000-0001-5126-4928","last_name":"Zeng","full_name":"Zeng, Zezhu","first_name":"Zezhu"},{"first_name":"Jiale","full_name":"Chen, Jiale","last_name":"Chen","orcid":"0000-0001-5337-5875","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785"},{"first_name":"Bingqing","full_name":"Cheng, Bingqing","id":"cbe3cda4-d82c-11eb-8dc7-8ff94289fcc9","orcid":"0000-0002-3584-9632","last_name":"Cheng"}],"publication_identifier":{"eissn":["1549-9626"],"issn":["1549-9618"]},"acknowledgement":"P.T. acknowledges funding from FFG MAGNIFICO and the BIDMaP Postdoctoral Fellowship. Z.Z. acknowledges funding from the European Union’s Horizon 2020 research and innovation program under the Marie Skłodowska-Curie grant agreement No. 101034413. The authors acknowledge the research computing facilities provided by the Institute of Science and Technology Austria (ISTA), and resources of the National Energy Research Scientific Computing Center (NERSC), a Department of Energy Office of Science User Facility using NERSC award DOEERCAP0031751 ’GenAI@NERSC’. P.T. acknowledges valued discussions with Dr. Daniel King, Dr. Lei Wang, and Dr. Fuzhi Dai.","date_updated":"2025-12-01T15:40:27Z","corr_author":"1","publication":"Journal of Chemical Theory and Computation","acknowledged_ssus":[{"_id":"ScienComp"}],"related_material":{"link":[{"relation":"software","url":"https://github.com/tuoping/alchemicalFES"}]},"OA_type":"closed access","external_id":{"pmid":["41172130"],"isi":["001605927900001"]},"article_processing_charge":"No","_id":"20704","quality_controlled":"1","doi":"10.1021/acs.jctc.5c01248","volume":21,"language":[{"iso":"eng"}],"month":"10","citation":{"short":"P. Tuo, Z. Zeng, J. Chen, B. Cheng, Journal of Chemical Theory and Computation 21 (2025) 11427–11435.","chicago":"Tuo, Ping, Zezhu Zeng, Jiale Chen, and Bingqing Cheng. 
“Scalable Multitemperature Free Energy Sampling of Classical Ising Spin States.” <i>Journal of Chemical Theory and Computation</i>. American Chemical Society, 2025. <a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">https://doi.org/10.1021/acs.jctc.5c01248</a>.","ista":"Tuo P, Zeng Z, Chen J, Cheng B. 2025. Scalable multitemperature free energy sampling of classical Ising spin states. Journal of Chemical Theory and Computation. 21(22), 11427–11435.","ama":"Tuo P, Zeng Z, Chen J, Cheng B. Scalable multitemperature free energy sampling of classical Ising spin states. <i>Journal of Chemical Theory and Computation</i>. 2025;21(22):11427-11435. doi:<a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">10.1021/acs.jctc.5c01248</a>","mla":"Tuo, Ping, et al. “Scalable Multitemperature Free Energy Sampling of Classical Ising Spin States.” <i>Journal of Chemical Theory and Computation</i>, vol. 21, no. 22, American Chemical Society, 2025, pp. 11427–35, doi:<a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">10.1021/acs.jctc.5c01248</a>.","ieee":"P. Tuo, Z. Zeng, J. Chen, and B. Cheng, “Scalable multitemperature free energy sampling of classical Ising spin states,” <i>Journal of Chemical Theory and Computation</i>, vol. 21, no. 22. American Chemical Society, pp. 11427–11435, 2025.","apa":"Tuo, P., Zeng, Z., Chen, J., &#38; Cheng, B. (2025). Scalable multitemperature free energy sampling of classical Ising spin states. <i>Journal of Chemical Theory and Computation</i>. American Chemical Society. 
<a href=\"https://doi.org/10.1021/acs.jctc.5c01248\">https://doi.org/10.1021/acs.jctc.5c01248</a>"},"issue":"22","status":"public","project":[{"name":"IST-BRIDGE: International postdoctoral program","grant_number":"101034413","call_identifier":"H2020","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c"}],"date_created":"2025-11-30T23:02:06Z","abstract":[{"text":"Generative models have advanced significantly in sampling material systems with continuous variables, such as atomistic structures. However, their application to discrete variables, like atom types or spin states, remains underexplored. In this work, we introduce a discrete flow matching model, tailored for systems with discrete phase-space coordinates (e.g., the Ising model or a multicomponent system on a lattice). This approach enables a single model to sample free energy surfaces over a wide temperature range with minimal training overhead, and the model generation is scalable to larger lattice sizes than those in the training set. We demonstrate our approach on the 2D Ising model, showing efficient and reliable free energy sampling. These results highlight the potential of flow matching for low-cost, scalable free energy sampling in discrete systems and suggest promising extensions to alchemical degrees of freedom in crystalline materials. 
The codebase developed for this work is openly available at https://github.com/tuoping/alchemicalFES.","lang":"eng"}],"title":"Scalable multitemperature free energy sampling of classical Ising spin states","publisher":"American Chemical Society","ec_funded":1,"year":"2025","scopus_import":"1","pmid":1},{"abstract":[{"lang":"eng","text":"The recent surge in high-quality open-source Generative AI text models (colloquially: LLMs), as well as efficient finetuning techniques, have opened the possibility of creating high-quality personalized models that generate text attuned to a specific individual’s needs and are capable of credibly imitating their writing style by refining an open-source model with that person’s own data. The technology to create such models is accessible to private individuals, and training and running such models can be done cheaply on consumer-grade hardware. While these advancements are a huge gain for usability and privacy, this position paper argues that the practical feasibility of impersonating specific individuals also introduces novel safety risks. For instance, this technology enables the creation of phishing emails\r\nor fraudulent social media accounts, based on small amounts of publicly available text, or by the individuals themselves to escape AI text detection. 
We further argue that these risks are complementary to—and distinct from—the much-discussed risks of other impersonation attacks such as image, voice, or video deepfakes, and are not adequately addressed by the larger research community, or the current generation of open- and closed-source models."}],"title":"Position: It's time to act on the risk of efficient personalized text generation","year":"2025","oa":1,"OA_type":"green","external_id":{"arxiv":["2502.06560"]},"_id":"21858","article_processing_charge":"No","language":[{"iso":"eng"}],"doi":"10.48550/arXiv.2502.06560","OA_place":"repository","status":"public","project":[{"name":"FastML: Efficient and Cost-Effective Distributed Machine Learning","grant_number":"101158077","_id":"8e35c14b-16d5-11f0-9cad-a3fc35339161"},{"name":"Vienna Graduate School on Computational Optimization","_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A","grant_number":"W1260-N35"}],"date_created":"2026-05-11T08:55:23Z","month":"06","citation":{"ieee":"E. B. Iofinova, A. Jovanovic, and D.-A. Alistarh, “Position: It’s time to act on the risk of efficient personalized text generation,” <i>arXiv</i>. .","mla":"Iofinova, Eugenia B., et al. “Position: It’s Time to Act on the Risk of Efficient Personalized Text Generation.” <i>ArXiv</i>, doi:<a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>.","ama":"Iofinova EB, Jovanovic A, Alistarh D-A. Position: It’s time to act on the risk of efficient personalized text generation. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>","apa":"Iofinova, E. B., Jovanovic, A., &#38; Alistarh, D.-A. (n.d.). Position: It’s time to act on the risk of efficient personalized text generation. <i>arXiv</i>. <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">https://doi.org/10.48550/arXiv.2502.06560</a>","short":"E.B. Iofinova, A. Jovanovic, D.-A. Alistarh, ArXiv (n.d.).","ista":"Iofinova EB, Jovanovic A, Alistarh D-A. 
Position: It’s time to act on the risk of efficient personalized text generation. arXiv, <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">10.48550/arXiv.2502.06560</a>.","chicago":"Iofinova, Eugenia B, Andrej Jovanovic, and Dan-Adrian Alistarh. “Position: It’s Time to Act on the Risk of Efficient Personalized Text Generation.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/arXiv.2502.06560\">https://doi.org/10.48550/arXiv.2502.06560</a>."},"arxiv":1,"main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2502.06560"}],"oa_version":"Preprint","author":[{"id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","orcid":"0000-0002-7778-3221","last_name":"Iofinova","first_name":"Eugenia B","full_name":"Iofinova, Eugenia B"},{"last_name":"Jovanovic","full_name":"Jovanovic, Andrej","first_name":"Andrej"},{"full_name":"Alistarh, Dan-Adrian","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","last_name":"Alistarh"}],"user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","corr_author":"1","acknowledgement":"This research was supported by the Scientific Service Units (SSU) of IST Austria through resources\r\nprovided by Scientific Computing (SciComp). EI was supported in part by the FWF DK VGSCO,\r\ngrant agreement number W1260-N35. AJ was supported in part by ERC Proof-of-Concept Grant\r\nFastML, grant agreement 101158077.","date_updated":"2026-07-27T12:50:03Z","related_material":{"record":[{"relation":"dissertation_contains","status":"public","id":"21854"}]},"publication":"arXiv","day":"02","department":[{"_id":"GradSch"},{"_id":"DaAl"}],"type":"preprint","date_published":"2025-06-02T00:00:00Z","publication_status":"draft"}]