@phdthesis{21854,
  abstract     = {As neural-network-based models grow both in size and popularity, interest has grown in making the models smaller and more efficient to train. To that end, many methods have been proposed to prune models by reducing their number of nonzero parameters. Additionally, parameter-efficient fine-tuning, in which a much smaller number of parameters than the total contained in the model is updated during training, has become very popular, especially in the space of Large Language Models. At the same time, the increasingly routine deployment of machine learning in real-world applications has spurred a drive to make them more trustworthy - in the sense of, among other things, being unbiased, interpretable, and editable. In this thesis, we examine the interplay between efficiency and trustworthiness.

First, we analyze the effects of model pruning on bias in computer vision models, demonstrating that increased sparsity leads to greater bias, largely as a function of increased model uncertainty in marginal cases. Based on this observation, we propose several bias mitigation techniques. Then, we demonstrate that example-specific model pruning can improve model interpretation methods while improving pruning efficiency to make example-specific model pruning feasible in real time. Then, we investigate the effectiveness of parameter-efficient and data-efficient model personalization via fine-tuning, demonstrating that it is highly feasible with very small computational and data resources. Finally, we consider efficiency in editing model knowledge using a custom synthetic data framework, demonstrating that parameter-efficient, low-rank fine-tuning frequently outperforms full-rank fine-tuning, and, additionally, that restricting which model blocks are fine-tuned frequently improves results. Together, the results in this thesis provide new insights and techniques for combining trustworthiness and efficiency during neural network inference and training.

-----------------“In reference to IEEE copyrighted material which is used with permission in this thesis, the IEEE does not endorse any of the Institute of Science and Technology Austria’s products or services. Internal or personal use of this material is permitted. If interested in reprinting/republishing IEEE copyrighted material for advertising or promotional purposes or for creating new collective works for resale or redistribution, please go to http://www.ieee.org/publications_standards/publications/rights/rights_link.html to learn how to obtain a License from RightsLink. If applicable, University Microfilms and/or ProQuest Library, or the Archives of Canada may supply single copies of the dissertation.”},
  author       = {Iofinova, Eugenia B},
  issn         = {2663-337X},
  pages        = {237},
  publisher    = {Institute of Science and Technology Austria},
  school       = {Institute of Science and Technology Austria},
  title        = {{On the utility and effects of efficiency in artificial neural networks}},
  doi          = {10.15479/AT-ISTA-21854},
  year         = {2026},
}

@misc{21857,
  abstract     = {The availability of powerful open-source large language models (LLMs) opens exciting use cases, such as using personal data to fine-tune these models to imitate a user’s unique writing style. Two key requirements for this functionality are personalization—in the sense that the output should recognizably reflect the user’s own writing style—and privacy—users may justifiably be wary of uploading extremely personal data, such as their email archive, to a third-party service. In this paper, we demonstrate the feasibility of training and running such an assistant, which we call Panza, on commodity hardware, for the specific use case of email generation. Panza’s personalization features are based on a combination of parameter-efficient fine-tuning using a variant of the Reverse Instructions technique [1] and Retrieval-Augmented Generation (RAG) [2]. We demonstrate that this combination allows us to fine-tune an LLM to reflect a user’s writing style using limited data, while executing on extremely limited resources, e.g. on a free Google Colab instance. Our key methodological contribution is the first detailed study of evaluation metrics for this task, and of how different choices of system components—the use of RAG and of different fine-tuning approaches—impact the system’s performance. Additionally, we demonstrate that very little data—under 100 email samples—is sufficient to create models that convincingly imitate humans, showcasing a previously unknown attack vector in language models. We are releasing the full Panza code as well as three new email datasets licensed for research use.},
  author       = {Nicolicioiu, Armand and Iofinova, Eugenia B and Jovanovic, Andrej and Kurtic, Eldar and Nikdan, Mahdi and Panferov, Andrei and Markov, Ilia and Shavit, Nir and Alistarh, Dan-Adrian},
  booktitle    = {Third Conference on Parsimony and Learning (Proceedings Track)},
  keywords     = {LLMs, PEFT, LoRA, personalization, efficient ML},
  location     = {Tübingen, Germany},
  publisher    = {OpenReview},
  title        = {{Panza: Investigating the feasibility of fully-local personalized text generation}},
  year         = {2026},
}

@unpublished{21859,
  abstract     = {As artificial neural networks, and specifically large language models, have improved rapidly in capabilities and quality, they have increasingly been deployed in real-world applications, from customer service to Google search, despite the fact that they frequently make factually incorrect or undesirable statements. This trend has inspired practical and academic interest in model editing, that is, in adjusting the weights of the model to modify its likely outputs for queries relating to a specific fact or set of facts. This may be done either to amend a fact or set of facts, for instance, to fix a frequent error in the training data, or to suppress a fact or set of facts entirely, for instance, in case of dangerous knowledge. Multiple methods have been proposed to do such edits. However, at the same time, it has been shown that such model editing can be brittle and incomplete. Moreover the effectiveness of any model editing method necessarily depends on the data on which the model is trained, and, therefore, a good understanding of the interaction of the training data distribution and the way it is stored in the network is necessary and helpful to reliably perform model editing. However, working with large language models trained on real-world data does not allow us to understand this relationship or fully measure the effects of model editing. We therefore propose Behemoth, a fully synthetic data generation framework. To demonstrate the practical insights from the framework, we explore model editing in the context of simple tabular data, demonstrating surprising findings that, in some cases, echo real-world results, for instance, that in some cases restricting the update rank results in a more effective update.},
  author       = {Iofinova, Eugenia B and Alistarh, Dan-Adrian},
  booktitle    = {arXiv},
  eprint       = {2601.23153},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{Behemoth: Benchmarking unlearning in LLMs using fully synthetic data}},
  doi          = {10.48550/arXiv.2601.23153},
  year         = {2026},
}

@inproceedings{19877,
  abstract     = {As inference on Large Language Models (LLMs) emerges as an important workload in machine learning applications, model weight quantization has become a standard technique for efficient GPU deployment. Quantization not only reduces model size, but has also been shown to yield substantial speedups for single-user inference, due to reduced memory movement, with low accuracy impact. Yet, it remains a key open question whether speedups are achievable also in batched settings with multiple parallel clients, which are highly relevant for practical serving. It is unclear whether GPU kernels can be designed to remain practically memory-bound, while supporting the substantially increased compute requirements of batched workloads.
In this paper, we resolve this question positively by introducing a new design for Mixed-precision Auto-Regressive LINear kernels, called MARLIN. Concretely, given a model whose weights are compressed via quantization to, e.g., 4 bits per element, MARLIN shows that batchsizes up to 16-32 can be practically supported with close to maximum (4×) quantization speedup, and larger batchsizes up to 64-128 with gradually decreasing, but still significant, acceleration. MARLIN accomplishes this via a combination of techniques, such as asynchronous memory access, complex task scheduling and pipelining, and bespoke quantization support. Our experiments show that MARLIN's near-optimal performance on individual LLM layers across different scenarios can also lead to significant end-to-end LLM inference speedups (of up to 2.8×) when integrated with the popular vLLM open-source serving engine. Finally, we show that MARLIN is extensible to further compression techniques, like NVIDIA 2:4 sparsity, leading to additional speedups.},
  author       = {Frantar, Elias and Castro, Roberto L. and Chen, Jiale and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9798400714436},
  location     = {Las Vegas, NV, United States},
  pages        = {239--251},
  publisher    = {Association for Computing Machinery},
  title        = {{MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models}},
  doi          = {10.1145/3710848.3710871},
  year         = {2025},
}

@article{19969,
  abstract     = {In the stochastic population protocol model, we are given a connected graph with $n$ nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1) exactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows. On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in $\Theta(n \log n)$ expected steps using $\Theta(\log \log n)$ states, whereas protocols that use $O(1)$ states require $\Theta(n^2)$ expected steps. In this work, we investigate the complexity of stable leader election on graphs. We provide the first non-trivial time lower bounds on general graphs, showing that, when moving beyond cliques, the complexity of stable leader election can range from $O(1)$ to $\Theta(n^3)$ expected steps. We describe a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only $O(\log^2 n)$ states that is at most a factor $O(\log n)$ slower. Finally, we observe that for many graphs the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor $O(n \log n)$ slower than the fast polynomial-state protocol, and among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs.},
  author       = {Alistarh, Dan-Adrian and Rybicki, Joel and Voitovych, Sasha},
  issn         = {1432-0452},
  journal      = {Distributed Computing},
  pages        = {207--245},
  publisher    = {Springer Nature},
  title        = {{Near-optimal leader election in population protocols on graphs}},
  doi          = {10.1007/s00446-025-00487-7},
  volume       = {38},
  year         = {2025},
}

@inproceedings{20032,
  abstract     = {We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. Source code is available at https://github.com/IST-DASLab/ScalableMNN.},
  author       = {Chen, Jiale and Yao, Dingling and Pervez, Adeel A and Alistarh, Dan-Adrian and Locatello, Francesco},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {63716--63737},
  publisher    = {ICLR},
  title        = {{Scalable mechanistic neural networks}},
  year         = {2025},
}

@inproceedings{20034,
  abstract     = {We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models.},
  author       = {Robert, Thomas and Safaryan, Mher and Modoranu, Ionut-Vlad and Alistarh, Dan-Adrian},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {101877--101913},
  publisher    = {ICLR},
  title        = {{LDAdam: Adaptive optimization from low-dimensional gradient statistics}},
  year         = {2025},
}

@inproceedings{20037,
  abstract     = {Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled "Wasserstein Neurons" in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons.},
  author       = {Sawmya, Shashata and Kong, Linghao and Markov, Ilia and Alistarh, Dan-Adrian and Shavit, Nir},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {26244--26274},
  publisher    = {ICLR},
  title        = {{Wasserstein distances, neuronal entanglement, and sparsity}},
  year         = {2025},
}

@inproceedings{20038,
  abstract     = {Pruning eliminates unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training--which combines pruning and pre-training into a single phase--provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25% of total training compute and concluding at 75% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference.},
  author       = {Jin, Tian and Humayun, Ahmed Imtiaz and Evci, Utku and Subramanian, Suvinay and Yazdanbakhsh, Amir and Alistarh, Dan-Adrian and Dziugaite, Gintare Karolina},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {85165--85181},
  publisher    = {ICLR},
  title        = {{The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws}},
  year         = {2025},
}

@inproceedings{20224,
  abstract     = {Traffic in datacenters may follow some pattern: some pairs of servers communicate more frequently than others. Demand-oblivious networks may perform poorly for such workloads, and demand-aware networks optimized for traffic should be used instead. Unfortunately, not all shapes of networks are feasible in real hardware. Practical limitations are usually provided in the form of a topology. For example, a network may be required to be a binary tree, a bounded-degree graph or a Fat tree.
In this work, we consider a topology of a binary tree, one of the most fundamental network topologies. We show that already finding an optimal demand-aware binary tree network is NP-hard. Then, we explore how various optimization techniques, including simple local searches, as well as deterministic mutation and crossover operators, cope with generating efficient tree networks on real-life and synthetic workloads.},
  author       = {Martynov, Pavel and Buzdalov, Maxim and Pankratov, Sergei and Aksenov, Vitaliy and Schmid, Stefan},
  booktitle    = {Proceedings of the 2025 Genetic and Evolutionary Computation Conference},
  isbn         = {9798400714658},
  location     = {Malaga, Spain},
  pages        = {249--257},
  publisher    = {Association for Computing Machinery},
  title        = {{In the search of optimal tree networks: Hardness and heuristics}},
  doi          = {10.1145/3712256.3726425},
  year         = {2025},
}

@inproceedings{20684,
  abstract     = {Quantization is a powerful tool for accelerating large language model (LLM) inference, but the accuracy-performance trade-offs across different formats remain unclear. In this paper, we conduct the most comprehensive empirical study to date, evaluating FP8, INT8, and INT4 quantization across academic benchmarks and real-world tasks on the entire Llama-3.1 model family. Through over 500,000 evaluations, our investigation yields several key findings: (1) FP8 (W8A8-FP) is effectively lossless across all model scales, (2) well-tuned INT8 (W8A8-INT) achieves surprisingly low (1-3%) accuracy degradation, and (3) INT4 weight-only (W4A16-INT) is more competitive than expected, rivaling 8-bit quantization. Further, we investigate the optimal quantization format for different deployments by analyzing inference performance through the popular vLLM framework. Our analysis provides clear deployment recommendations: W4A16 is the most cost-efficient for synchronous setups, while W8A8 dominates in asynchronous continuous batching. For mixed workloads, the optimal choice depends on the specific use case. Our findings offer practical, data-driven guidelines for deploying quantized LLMs at scale—ensuring the best balance between speed, efficiency, and accuracy.},
  author       = {Kurtic, Eldar and Marques, Alexandre and Pandit, Shubhra and Kurtz, Mark and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics},
  isbn         = {9798891762510},
  issn         = {0736-587X},
  location     = {Vienna, Austria},
  pages        = {26872--26886},
  publisher    = {Association for Computational Linguistics},
  title        = {{“Give me BF16 or give me death”? Accuracy-performance trade-offs in LLM quantization}},
  year         = {2025},
}

@article{20704,
  abstract     = {Generative models have advanced significantly in sampling material systems with continuous variables, such as atomistic structures. However, their application to discrete variables, like atom types or spin states, remains underexplored. In this work, we introduce a discrete flow matching model, tailored for systems with discrete phase-space coordinates (e.g., the Ising model or a multicomponent system on a lattice). This approach enables a single model to sample free energy surfaces over a wide temperature range with minimal training overhead, and the model generation is scalable to larger lattice sizes than those in the training set. We demonstrate our approach on the 2D Ising model, showing efficient and reliable free energy sampling. These results highlight the potential of flow matching for low-cost, scalable free energy sampling in discrete systems and suggest promising extensions to alchemical degrees of freedom in crystalline materials. The codebase developed for this work is openly available at https://github.com/tuoping/alchemicalFES.},
  author       = {Tuo, Ping and Zeng, Zezhu and Chen, Jiale and Cheng, Bingqing},
  issn         = {1549-9626},
  journal      = {Journal of Chemical Theory and Computation},
  number       = {22},
  pages        = {11427--11435},
  publisher    = {American Chemical Society},
  title        = {{Scalable multitemperature free energy sampling of classical Ising spin states}},
  doi          = {10.1021/acs.jctc.5c01248},
  volume       = {21},
  year         = {2025},
}

@article{19713,
  abstract     = {Distributed optimization is the standard way of speeding up machine learning training, and most of the research in the area focuses on distributed first-order, gradient-based methods. Yet, there are settings where some computationally-bounded nodes may not be able to implement first-order, gradient-based optimization, while they could still contribute to joint optimization tasks. In this paper, we initiate the study of hybrid decentralized optimization, studying settings where nodes with zeroth-order and first-order optimization capabilities co-exist in a distributed system, and attempt to jointly solve an optimization task over some data distribution. We essentially show that, under reasonable parameter settings, such a system can not only withstand noisier zeroth-order agents but can even benefit from integrating such agents into the optimization process, rather than ignoring their information. At the core of our approach is a new analysis of distributed optimization with noisy and possibly-biased gradient estimators, which may be of independent interest. Our results hold for both convex and non-convex objectives. Experimental results on standard optimization tasks confirm our analysis, showing that hybrid first-zeroth order optimization can be practical, even when training deep neural networks.},
  author       = {Talaei, Shayan and Ansaripour, Matin and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  issn         = {2374-3468},
  journal      = {Proceedings of the 39th AAAI Conference on Artificial Intelligence},
  number       = {19},
  pages        = {20778--20786},
  publisher    = {Association for the Advancement of Artificial Intelligence},
  title        = {{Hybrid decentralized optimization: Leveraging both first- and zeroth-order optimizers for faster convergence}},
  doi          = {10.1609/aaai.v39i19.34290},
  volume       = {39},
  year         = {2025},
}

@inproceedings{20820,
  abstract     = {The high computational costs of large language models (LLMs) have led to a flurry of research on LLM compression, via methods such as quantization, sparsification, or structured pruning. A new frontier in this area is given by dynamic, non-uniform compression methods, which adjust the compression levels (e.g., sparsity) per-block or even per-layer in order to minimize accuracy loss, while guaranteeing a global compression threshold. Yet, current methods rely on estimating the "importance" of a given layer, implicitly assuming that layers contribute independently to the overall compression error. We begin from the motivating observation that this independence assumption does not generally hold for LLM compression: pruning a model further may even significantly recover performance. To address this, we propose EvoPress, a novel evolutionary framework for dynamic LLM compression. By formulating dynamic compression as a general optimization problem, EvoPress identifies optimal compression profiles in a highly efficient manner, and generalizes across diverse models and compression techniques. Via EvoPress, we achieve state-of-the-art performance for dynamic compression of Llama, Mistral, and Phi models, setting new benchmarks for structural pruning (block/layer dropping), unstructured sparsity, and quantization with dynamic bitwidths.},
  author       = {Sieberling, Oliver and Kuznedelev, Denis and Kurtic, Eldar and Alistarh, Dan-Adrian},
  booktitle    = {42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {55556--55590},
  publisher    = {ML Research Press},
  title        = {{EvoPress: Accurate dynamic model compression via evolutionary search}},
  volume       = {267},
  year         = {2025},
}

@inproceedings{20821,
  abstract     = {Modern deep neural networks exhibit heterogeneity across numerous layers of various types such as residuals, multi-head attention, etc., due to varying structures (dimensions, activation functions, etc.), distinct representation characteristics, which impact predictions. We develop a general layer-wise quantization framework with tight variance and code-length bounds, adapting to the heterogeneities over the course of training. We then apply a new layer-wise quantization technique within distributed variational inequalities (VIs), proposing a novel Quantized Optimistic Dual Averaging (QODA) algorithm with adaptive learning rates, which achieves competitive convergence rates for monotone VIs. We empirically show that QODA achieves up to a 150% speedup over the baselines in end-to-end training time for training Wasserstein GAN on 12+ GPUs.},
  author       = {Nguyen, Anh Duc and Markov, Ilia and Wu, Frank Zhengqing and Ramezani-Kebrya, Ali and Antonakopoulos, Kimon and Alistarh, Dan-Adrian and Cevher, Volkan},
  booktitle    = {42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {46026--46072},
  publisher    = {ML Research Press},
  title        = {{Layer-wise quantization for quantized optimistic dual averaging}},
  volume       = {267},
  year         = {2025},
}

@inproceedings{21250,
  abstract     = {We investigate the step complexity of the Leader Election problem (and implementing the corresponding test-and-set object) in asynchronous shared memory, where processes communicate through registers supporting atomic read and write and must coordinate so that a single process becomes the leader. Determining tight step complexity bounds for solving this problem is one of the key open problems in the theory of shared memory distributed computing. The best known algorithm is a randomized tournament-tree, which has worst-case expected step complexity O(log N) for N processes. There are provably no deterministic wait-free algorithms, and only restricted lower bounds are known for obstruction-free and randomized wait-free algorithms. We introduce a new lower bound that establishes an Ω((log N)/(log log N + log Q)) step complexity for any obstruction-free Leader Election algorithm, where N is the number of processes, and 2 ≤ Q ≤ N is a bound on the value contention, which we define as the maximum number of different values that processes can be simultaneously poised to write to the same register in any execution of the algorithm. Our result is strictly stronger than previous bounds based on write contention. In particular, it implies new lower bounds on step complexity that depend on register size.},
  author       = {Alistarh, Dan-Adrian and Ellen, Faith and Fedorov, Alexander},
  booktitle    = {39th International Symposium on Distributed Computing},
  location     = {Berlin, Germany},
  pages        = {3:1--3:16},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{An almost-logarithmic lower bound for leader election with bounded value contention}},
  doi          = {10.4230/LIPIcs.DISC.2025.3},
  volume       = {356},
  year         = {2025},
}

% Chapter (pp. 83--97) in an edited Springer volume. The chapter authors differ
% from the book editors, so the entry type is incollection: classic styles
% reject author+editor on inbook and ignore its booktitle field.
% The percent sign in the abstract is escaped so it cannot start a TeX comment
% when the field is copied into a .bbl file.
% NOTE(review): "Goinv, Michael" looks like a misspelling of "Goin, Michael";
% left as exported -- confirm against the publisher record before changing.
@incollection{21257,
  abstract     = {We investigate the problem of accurate sparse fine-tuning of large language models (LLMs), that is, fine-tuning pre-trained LLMs on specialized tasks, while inducing sparsity in their weights. Our work is motivated by experiments showing that standard loss-based fine-tuning methods are not able to achieve high accuracy in this setting, especially at high sparsity targets. To address this issue, we perform a detailed study of knowledge distillation losses for fine-tuning of sparse models. We determine an L2-based distillation approach that we term ‘SquareHead’, which enables accurate recovery even at higher sparsities. Investigating the question of efficient inference, we show that sparse LLMs can be executed faster by taking advantage of sparsity. Specifically, we exhibit end-to-end results showing speedups enabled by sparsity, while recovering accuracy, on the following models and tasks, respectively: T5 for language translation, Whisper for speech translation, and open GPT-type models such as the Mosaic Pre-Trained Transformer (MPT) and Llama-2 models for text generation. In particular, for popular generative tasks, we show for the first time that sparse fine-tuning can reach 75\% sparsity without drops in accuracy, and provide notable end-to-end speedups for inference on CPUs. Moreover, we also highlight that sparsity is compatible with other compression approaches, such as quantization.},
  author       = {Kurtic, Eldar and Kuznedelev, Denis and Frantar, Elias and Goinv, Michael and Pandit, Shubhra and Agarwalla, Abhinav and Nguyen, Tuan and Marques, Alexandre and Kurtz, Mark and Alistarh, Dan-Adrian},
  booktitle    = {Enhancing LLM Performance. Efficacy, Fine-Tuning, and Inference Techniques},
  editor       = {Passban, Peyman and Way, Andy and Rezagholizadeh, Mehdi},
  isbn         = {9783031857461},
  issn         = {2522-803X},
  pages        = {83--97},
  publisher    = {Springer Nature},
  title        = {{Sparse Fine-Tuning for Inference Acceleration of Large Language Models}},
  doi          = {10.1007/978-3-031-85747-8_6},
  year         = {2025},
}

% arXiv preprint (position paper). The unpublished entry type requires a
% `note`, and the arXiv identifier is exposed through eprint/eprinttype so
% eprint-aware styles can print and link it; the id is taken from the DOI.
@unpublished{21858,
  abstract     = {The recent surge in high-quality open-source Generative AI text models (colloquially: LLMs), as well as efficient finetuning techniques, have opened the possibility of creating high-quality personalized models that generate text attuned to a specific individual’s needs and are capable of credibly imitating their writing style by refining an open-source model with that person’s own data. The technology to create such models is accessible to private individuals, and training and running such models can be done cheaply on consumer-grade hardware. While these advancements are a huge gain for usability and privacy, this position paper argues that the practical feasibility of impersonating specific individuals also introduces novel safety risks. For instance, this technology enables the creation of phishing emails or fraudulent social media accounts, based on small amounts of publicly available text, or by the individuals themselves to escape AI text detection. We further argue that these risks are complementary to—and distinct from—the much-discussed risks of other impersonation attacks such as image, voice, or video deepfakes, and are not adequately addressed by the larger research community, or the current generation of open- and closed-source models.},
  author       = {Iofinova, Eugenia B and Jovanovic, Andrej and Alistarh, Dan-Adrian},
  booktitle    = {arXiv},
  eprint       = {2502.06560},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{Position: It's time to act on the risk of efficient personalized text generation}},
  doi          = {10.48550/arXiv.2502.06560},
  year         = {2025},
}

% Conference paper in the proceedings of the 44th ICDCS (IEEE, 2024).
% The conference city is stored in `venue`; biblatex treats `location` as the
% publisher's place and would print it next to the publisher name.
@inproceedings{18070,
  abstract     = {Parallel SGD in a shared-memory setting is oft-represented by the popular Hogwild! algorithm, in which lock-free updates are asynchronously performed by multiple computing processes. Unfortunately, scaling Hogwild! to distributed workers is largely unexplored. Specifically, it is unknown if any adaptation of Hogwild! to the popular decentralized multi-GPU setting offers any competitive speedup, either empirically or theoretically. In this work, we investigate the potential of decentralizing Hogwild! by incorporating simultaneously (a) asynchronous local gradient updates on the shared memory of GPUs, and (b) non-blocking asynchronous decentralized federated averaging. A naive direct implementation shows degradation in performance, arising from scheduling overheads and concurrent write conflicts on GPUs. To mitigate these drawbacks, we investigate and propose a new method, based on careful block selection rules, which update only portions of the parameter vectors. Our experiments show that the resulting decentralized training method exhibits improved throughput and competitive accuracy for standard image classification benchmarks on the CIFAR-10, CIFAR-100, and Imagenet datasets. On the theoretical side, we prove that our method guarantees sublinear ergodic convergence rates for non-convex objectives.},
  author       = {Chatterjee, Bapi and Kungurtsev, Vyacheslav and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 44th International Conference on Distributed Computing Systems},
  isbn         = {9798350386059},
  issn         = {2575-8411},
  venue        = {Jersey City, NJ, United States},
  pages        = {857--868},
  publisher    = {IEEE},
  title        = {{Federated SGD with local asynchrony}},
  doi          = {10.1109/ICDCS60910.2024.00084},
  year         = {2024},
}

% Conference paper in the proceedings of the 41st ICML (2024), volume 235 of
% the series named below.
% The conference city is stored in `venue`; biblatex treats `location` as the
% publisher's place and would print it next to the publisher name.
@inproceedings{18113,
  abstract     = {The emergence of accurate open large language models (LLMs) has led to a race towards performant quantization techniques which can enable their execution on end-user devices. In this paper, we revisit the problem of “extreme” LLM compression—defined as targeting extremely low bit counts, such as 2 to 3 bits per parameter—from the point of view of classic methods in Multi-Codebook Quantization (MCQ). Our algorithm, called AQLM, generalizes the classic Additive Quantization (AQ) approach for information retrieval to advance the state-of-the-art in LLM compression, via two innovations: 1) learned additive quantization of weight matrices in input-adaptive fashion, and 2) joint optimization of codebook parameters across each transformer blocks. Broadly, AQLM is the first scheme that is Pareto optimal in terms of accuracy-vs-model-size when compressing to less than 3 bits per parameter, and significantly improves upon all known schemes in the extreme compression (2bit) regime. In addition, AQLM is practical: we provide fast GPU and CPU implementations of AQLM for token generation, which enable us to match or outperform optimized FP16 implementations for speed, while executing in a much smaller memory footprint.},
  author       = {Egiazarian, Vage and Panferov, Andrei and Kuznedelev, Denis and Frantar, Elias and Babenko, Artem and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  issn         = {2640-3498},
  venue        = {Vienna, Austria},
  pages        = {12284--12303},
  publisher    = {ML Research Press},
  series       = {Proceedings of Machine Learning Research},
  title        = {{Extreme compression of large language models via additive quantization}},
  volume       = {235},
  year         = {2024},
}

