@inproceedings{17378,
  abstract     = {Generative Pre-trained Transformer models, known as GPT or OPT, set themselves apart through breakthrough performance across complex language modelling tasks, but also by their extremely high computational and storage costs. Specifically, due to their massive size, even inference for large, highly-accurate GPT models may require multiple performant GPUs, which limits the usability of such models. While there is emerging work on relieving this pressure via model compression, the applicability and performance of existing compression techniques is limited by the scale and complexity of GPT models. In this paper, we address this challenge, and propose OPTQ, a new one-shot weight quantization method based on approximate second-order information, that is both highly-accurate and highly-efficient. Specifically, OPTQ can quantize GPT models with 175 billion parameters in approximately four GPU hours, reducing the bitwidth down to 3 or 4 bits per weight, with negligible accuracy degradation relative to the uncompressed baseline. Our method more than doubles the compression gains relative to previously-proposed one-shot quantization methods, preserving accuracy, allowing us for the first time to execute an 175 billion-parameter model inside a single GPU for generative inference. Moreover, we also show that our method can still provide reasonable accuracy in the extreme quantization regime, in which weights are quantized to 2-bit or even ternary quantization levels. We show experimentally that these improvements can be leveraged for end-to-end inference speedups over FP16, of around 3.25x when using high-end GPUs (NVIDIA A100) and 4.5x when using more cost-effective ones (NVIDIA A6000). The implementation is available at https://github.com/IST-DASLab/gptq.},
  author       = {Frantar, Elias and Ashkboos, Saleh and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle    = {11th International Conference on Learning Representations },
  location     = {Kigali, Rwanda},
  publisher    = {International Conference on Learning Representations},
  title        = {{OPTQ: Accurate post-training quantization for generative pre-trained transformers}},
  year         = {2023},
}

@inproceedings{14458,
  abstract     = {We show for the first time that large-scale generative pretrained transformer (GPT) family models can be pruned to at least 50% sparsity in one-shot, without any retraining, at minimal loss of accuracy. This is achieved via a new pruning method called SparseGPT, specifically designed to work efficiently and accurately on massive GPT-family models. We can execute SparseGPT on the largest available open-source models, OPT-175B and BLOOM-176B, in under 4.5 hours, and can reach 60% unstructured sparsity with negligible increase in perplexity: remarkably, more than 100 billion weights from these models can be ignored at inference time. SparseGPT generalizes to semi-structured (2:4 and 4:8) patterns, and is compatible with weight quantization approaches. The code is available at: https://github.com/IST-DASLab/sparsegpt.},
  author       = {Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {10323--10337},
  publisher    = {ML Research Press},
  title        = {{SparseGPT: Massive language models can be accurately pruned in one-shot}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{14459,
  abstract     = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.},
  author       = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {31151--31209},
  publisher    = {ML Research Press},
  title        = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}},
  volume       = {202},
  year         = {2023},
}

@inproceedings{11180,
  abstract     = {Designing and implementing efficient parallel priority schedulers is an active research area. An intriguing proposed design is the Multi-Queue: given n threads and m ≥ n distinct priority queues, task insertions are performed uniformly at random, while, to delete, a thread picks two queues uniformly at random, and removes the observed task of higher priority. This approach scales well, and has probabilistic rank guarantees: roughly, the rank of each task removed, relative to remaining tasks in all other queues, is O (m) in expectation. Yet, the performance of this pattern is below that of well-engineered schedulers, which eschew theoretical guarantees for practical efficiency.

We investigate whether it is possible to design and implement a Multi-Queue-based task scheduler that is both highly-efficient and has analytical guarantees. We propose a new variant called the Stealing Multi-Queue (SMQ), a cache-efficient variant of the Multi-Queue, which leverages both queue affinity---each thread has a local queue, from which tasks are usually removed; but, with some probability, threads also attempt to steal higher-priority tasks from the other queues---and task batching, that is, the processing of several tasks in a single insert / remove step. These ideas are well-known for task scheduling without priorities; our theoretical contribution is showing that, despite relaxations, this design can still provide rank guarantees, which in turn implies bounds on total work performed. We provide a general SMQ implementation which can surpass state-of-the-art schedulers such as OBIM and PMOD in terms of performance on popular graph-processing benchmarks. Notably, the performance improvement comes mainly from the superior rank guarantees provided by our scheduler, confirming that analytically-reasoned approaches can still provide performance improvements for priority task scheduling.},
  author       = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450392044},
  location     = {Seoul, Republic of Korea},
  pages        = {353--367},
  publisher    = {Association for Computing Machinery},
  title        = {{Multi-queues can be state-of-the-art priority schedulers}},
  doi          = {10.1145/3503221.3508432},
  year         = {2022},
}

@inproceedings{11181,
  abstract     = {To maximize the performance of concurrent data structures, researchers have often turned to highly complex fine-grained techniques, resulting in efficient and elegant algorithms, which can however be often difficult to understand and prove correct. While simpler techniques exist, such as transactional memory, they can have limited performance or portability relative to their fine-grained counterparts. Approaches at both ends of this complexity-performance spectrum have been extensively explored, but relatively less is known about the middle ground: approaches that are willing to sacrifice some performance for simplicity, while remaining competitive with state-of-the-art handcrafted designs. In this paper, we explore this middle ground, and present PathCAS, a primitive that combines ideas from multi-word CAS (KCAS) and transactional memory approaches, while carefully avoiding overhead. We show how PathCAS can be used to implement efficient search data structures relatively simply, using an internal binary search tree as an example, then extending this to an AVL tree. Our best implementations outperform many handcrafted search trees: in search-heavy workloads, it rivals the BCCO tree [5], the fastest known concurrent binary tree in terms of search performance [3]. Our results suggest that PathCAS can yield concurrent data structures that are relatively easy to build and prove correct, while offering surprisingly high performance.},
  author       = {Brown, Trevor A and Sigouin, William and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  isbn         = {9781450392044},
  location     = {Seoul, Republic of Korea},
  pages        = {385--399},
  publisher    = {Association for Computing Machinery},
  title        = {{PathCAS: An efficient middle ground for concurrent search data structures}},
  doi          = {10.1145/3503221.3508410},
  year         = {2022},
}

@inproceedings{11183,
  abstract     = {Subgraph detection has recently been one of the most studied problems in the CONGEST model of distributed computing. In this work, we study the distributed complexity of problems closely related to subgraph detection, mainly focusing on induced subgraph detection. The main line of this work presents lower bounds and parameterized algorithms w.r.t structural parameters of the input graph:
- On general graphs, we give unconditional lower bounds for induced detection of cycles and patterns of treewidth 2 in CONGEST. Moreover, by adapting reductions from centralized parameterized complexity, we prove lower bounds in CONGEST for detecting patterns with a 4-clique, and for induced path detection conditional on the hardness of triangle detection in the congested clique.
- On graphs of bounded degeneracy, we show that induced paths can be detected fast in CONGEST using techniques from parameterized algorithms, while detecting cycles and patterns of treewidth 2 is hard.
- On graphs of bounded vertex cover number, we show that induced subgraph detection is easy in CONGEST for any pattern graph. More specifically, we adapt a centralized parameterized algorithm for a more general maximum common induced subgraph detection problem to the distributed setting. In addition to these induced subgraph detection results, we study various related problems in the CONGEST and congested clique models, including for multicolored versions of subgraph-detection-like problems.},
  author       = {Nikabadi, Amir and Korhonen, Janne},
  booktitle    = {25th International Conference on Principles of Distributed Systems},
  editor       = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia},
  isbn         = {9783959772198},
  issn         = {1868-8969},
  location     = {Strasbourg, France},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Beyond distributed subgraph detection: Induced subgraphs, multicolored problems and graph parameters}},
  doi          = {10.4230/LIPIcs.OPODIS.2021.15},
  volume       = {217},
  year         = {2022},
}

@inproceedings{11184,
  abstract     = {Let G be a graph on n nodes. In the stochastic population protocol model, a collection of n indistinguishable, resource-limited nodes collectively solve tasks via pairwise interactions. In each interaction, two randomly chosen neighbors first read each other’s states, and then update their local states. A rich line of research has established tight upper and lower bounds on the complexity of fundamental tasks, such as majority and leader election, in this model, when G is a clique. Specifically, in the clique, these tasks can be solved fast, i.e., in n polylog n pairwise interactions, with high probability, using at most polylog n states per node.
In this work, we consider the more general setting where G is an arbitrary regular graph, and present a technique for simulating protocols designed for fully-connected networks in any connected regular graph. Our main result is a simulation that is efficient on many interesting graph families: roughly, the simulation overhead is polylogarithmic in the number of nodes, and quadratic in the conductance of the graph. As a sample application, we show that, in any regular graph with conductance φ, both leader election and exact majority can be solved in φ^{-2} ⋅ n polylog n pairwise interactions, with high probability, using at most φ^{-2} ⋅ polylog n states per node. This shows that there are fast and space-efficient population protocols for leader election and exact majority on graphs with good expansion properties. We believe our results will prove generally useful, as they allow efficient technology transfer between the well-mixed (clique) case, and the under-explored spatial setting.},
  author       = {Alistarh, Dan-Adrian and Gelashvili, Rati and Rybicki, Joel},
  booktitle    = {25th International Conference on Principles of Distributed Systems},
  editor       = {Bramas, Quentin and Gramoli, Vincent and Milani, Alessia},
  isbn         = {9783959772198},
  issn         = {1868-8969},
  location     = {Strasbourg, France},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Fast graphical population protocols}},
  doi          = {10.4230/LIPIcs.OPODIS.2021.14},
  volume       = {217},
  year         = {2022},
}

@inproceedings{11707,
  abstract     = {In this work we introduce the graph-theoretic notion of mendability: for each locally checkable graph problem we can define its mending radius, which captures the idea of how far one needs to modify a partial solution in order to “patch a hole.” We explore how mendability is connected to the existence of efficient algorithms, especially in distributed, parallel, and fault-tolerant settings. It is easy to see that O(1)-mendable problems are also solvable in O(log∗n) rounds in the LOCAL model of distributed computing. One of the surprises is that in paths and cycles, a converse also holds in the following sense: if a problem Π can be solved in O(log∗n), there is always a restriction Π′⊆Π that is still efficiently solvable but that is also O(1)-mendable. We also explore the structure of the landscape of mendability. For example, we show that in trees, the mending radius of any locally checkable problem is O(1), Θ(logn), or Θ(n), while in general graphs the structure is much more diverse.},
  author       = {Balliu, Alkida and Hirvonen, Juho and Melnyk, Darya and Olivetti, Dennis and Rybicki, Joel and Suomela, Jukka},
  booktitle    = {International Colloquium on Structural Information and Communication Complexity},
  editor       = {Parter, Merav},
  isbn         = {9783031099922},
  issn         = {1611-3349},
  location     = {Paderborn, Germany},
  pages        = {1--20},
  publisher    = {Springer Nature},
  title        = {{Local mending}},
  doi          = {10.1007/978-3-031-09993-9_1},
  volume       = {13298},
  year         = {2022},
}

@inproceedings{11844,
  abstract     = {In the stochastic population protocol model, we are given a connected graph with n nodes, and in every time step, a scheduler samples an edge of the graph uniformly at random and the nodes connected by this edge interact. A fundamental task in this model is stable leader election, in which all nodes start in an identical state and the aim is to reach a configuration in which (1) exactly one node is elected as leader and (2) this node remains as the unique leader no matter what sequence of interactions follows. On cliques, the complexity of this problem has recently been settled: time-optimal protocols stabilize in Θ(n log n) expected steps using Θ(log log n) states, whereas protocols that use O(1) states require Θ(n2) expected steps.

In this work, we investigate the complexity of stable leader election on general graphs. We provide the first non-trivial time lower bounds for leader election on general graphs, showing that, when moving beyond cliques, the complexity landscape of leader election becomes very diverse: the time required to elect a leader can range from O(1) to Θ(n3) expected steps. On the upper bound side, we first observe that there exists a protocol that is time-optimal on many graph families, but uses polynomially-many states. In contrast, we give a near-time-optimal protocol that uses only O(log2n) states that is at most a factor log n slower. Finally, we show that the constant-state protocol of Beauquier et al. [OPODIS 2013] is at most a factor n log n slower than the fast polynomial-state protocol. Moreover, among constant-state protocols, this protocol has near-optimal average case complexity on dense random graphs.},
  author       = {Alistarh, Dan-Adrian and Rybicki, Joel and Voitovych, Sasha},
  booktitle    = {Proceedings of the Annual ACM Symposium on Principles of Distributed Computing},
  isbn         = {9781450392624},
  location     = {Salerno, Italy},
  pages        = {246--256},
  publisher    = {Association for Computing Machinery},
  title        = {{Near-optimal leader election in population protocols on graphs}},
  doi          = {10.1145/3519270.3538435},
  year         = {2022},
}

@inproceedings{12182,
  abstract     = {Online algorithms make decisions based on past inputs, with the goal of being competitive against an algorithm that sees also future inputs. In this work, we introduce time-local online algorithms; these are online algorithms in which the output at any given time is a function of only T latest inputs. Our main observation is that time-local online algorithms are closely connected to local distributed graph algorithms: distributed algorithms make decisions based on the local information in the spatial dimension, while time-local online algorithms make decisions based on the local information in the temporal dimension. We formalize this connection, and show how we can directly use the tools developed to study distributed approximability of graph optimization problems to prove upper and lower bounds on the competitive ratio achieved with time-local online algorithms. Moreover, we show how to use computational techniques to synthesize optimal time-local algorithms.},
  author       = {Pacut, Maciej and Parham, Mahmoud and Rybicki, Joel and Schmid, Stefan and Suomela, Jukka and Tereshchenko, Aleksandr},
  booktitle    = {36th International Symposium on Distributed Computing},
  issn         = {1868-8969},
  location     = {Augusta, GA, United States},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Brief announcement: Temporal locality in online algorithms}},
  doi          = {10.4230/LIPIcs.DISC.2022.52},
  volume       = {246},
  year         = {2022},
}

@inproceedings{12299,
  abstract     = {Transfer learning is a classic paradigm by which models pretrained on large “upstream” datasets are adapted to yield good results on “downstream” specialized datasets. Generally, more accurate models on the “upstream” dataset tend to provide better transfer accuracy “downstream”. In this work, we perform an in-depth investigation of this phenomenon in the context of convolutional neural networks (CNNs) trained on the ImageNet dataset, which have been pruned-that is, compressed by sparsifiying their connections. We consider transfer using unstructured pruned models obtained by applying several state-of-the-art pruning methods, including magnitude-based, second-order, regrowth, lottery-ticket, and regularization approaches, in the context of twelve standard transfer tasks. In a nutshell, our study shows that sparse models can match or even outperform the transfer performance of dense models, even at high sparsities, and, while doing so, can lead to significant inference and even training speedups. At the same time, we observe and analyze significant differences in the behaviour of different pruning methods. The code is available at: https://github.com/IST-DASLab/sparse-imagenet-transfer.},
  author       = {Iofinova, Eugenia B and Peste, Elena-Alexandra and Kurtz, Mark and Alistarh, Dan-Adrian},
  booktitle    = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  issn         = {2575-7075},
  location     = {New Orleans, LA, United States},
  pages        = {12256--12266},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{How well do sparse ImageNet models transfer?}},
  doi          = {10.1109/cvpr52688.2022.01195},
  year         = {2022},
}

@inproceedings{12780,
  abstract     = {The ability to scale out training workloads has been one of the key performance enablers of deep learning. The main scaling approach is data-parallel GPU-based training, which has been boosted by hardware and software support for highly efficient point-to-point communication, and in particular via hardware bandwidth over-provisioning. Overprovisioning comes at a cost: there is an order of magnitude price difference between "cloud-grade" servers with such support, relative to their popular "consumer-grade" counterparts, although single server-grade and consumer-grade GPUs can have similar computational envelopes.

In this paper, we show that the costly hardware overprovisioning approach can be supplanted via algorithmic and system design, and propose a framework called CGX, which provides efficient software support for compressed communication in ML applications, for both multi-GPU single-node training, as well as larger-scale multi-node training. CGX is based on two technical advances: At the system level, it relies on a re-developed communication stack for ML frameworks, which provides flexible, highly-efficient support for compressed communication. At the application level, it provides seamless, parameter-free integration with popular frameworks, so that end-users do not have to modify training recipes, nor significant training code. This is complemented by a layer-wise adaptive compression technique which dynamically balances compression gains with accuracy preservation. CGX integrates with popular ML frameworks, providing up to 3X speedups for multi-GPU nodes based on commodity hardware, and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.},
  author       = {Markov, Ilia and Ramezanikebrya, Hamidreza and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 23rd ACM/IFIP International Middleware Conference},
  isbn         = {9781450393409},
  location     = {Quebec, QC, Canada},
  pages        = {241--254},
  publisher    = {Association for Computing Machinery},
  title        = {{CGX: Adaptive system support for communication-efficient deep learning}},
  doi          = {10.1145/3528535.3565248},
  year         = {2022},
}

@misc{13076,
  abstract     = {The source code for replicating experiments presented in the paper.

The implementation of the designed priority schedulers can be found in Galois-2.2.1/include/Galois/WorkList/:
StealingMultiQueue.h is the StealingMultiQueue.
MQOptimized/ contains MQ Optimized variants.

We provide images that contain all the dependencies and datasets. Images can be pulled from npostnikova/mq-based-schedulers repository, or downloaded from Zenodo. See readme for more detail.},
  author       = {Postnikova, Anastasiia and Koval, Nikita and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  publisher    = {Zenodo},
  title        = {{Multi-queues can be state-of-the-art priority schedulers}},
  doi          = {10.5281/ZENODO.5733408},
  year         = {2022},
}

@inproceedings{17059,
  abstract     = {The recent focus on the efficiency of deep neural networks (DNNs) has led to significant work on model compression approaches, of which weight pruning is one of the most popular. At the same time, there is rapidly-growing computational support for efficiently executing the unstructured-sparse models obtained via pruning. Yet, most existing pruning methods minimize just the number of remaining weights, i.e. the size of the model, rather than optimizing for inference time. We address this gap by introducing SPDY, a new compression method which automatically determines layer-wise sparsity targets achieving a desired inference speedup on a given system, while minimizing accuracy loss. SPDY is the composition of two new techniques. The first is an efficient and general dynamic programming algorithm for solving constrained layer-wise compression problems, given a set of layer-wise error scores. The second technique is a local search procedure for automatically determining such scores in an accurate and robust manner. Experiments across popular vision and language models show that SPDY guarantees speedups while recovering higher accuracy relative to existing strategies, both for one-shot and gradual pruning scenarios, and is compatible with most existing pruning approaches. We also extend our approach to the recently-proposed task of pruning with very little data, where we achieve the best known accuracy recovery when pruning to the GPU-supported 2:4 sparsity pattern.},
  author       = {Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle    = {39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  pages        = {6726--6743},
  publisher    = {ML Research Press},
  title        = {{SPDY: Accurate pruning with speedup guarantees}},
  volume       = {162},
  year         = {2022},
}

@inproceedings{17088,
  abstract     = {In this paper, we consider the problem of sparsifying BERT models, which are a key building block for natural language processing, in order to reduce their storage and computational cost. We introduce the Optimal BERT Surgeon (oBERT), an efficient and accurate pruning method based on approximate second-order information, which we show to yield state-of-the-art results in both stages of language tasks: pre-training and fine-tuning. Specifically, oBERT extends existing work on second-order pruning by allowing for pruning weight blocks, and is the first such method that is applicable at BERT scale. Second, we investigate compounding compression approaches to obtain highly compressed but accurate models for deployment on edge devices. These models significantly push boundaries of the current state-of-the-art sparse BERT models with respect to all metrics: model size, inference speed and task accuracy. For example, relative to the dense BERT-base, we obtain 10x model size compression with < 1% accuracy drop, 10x CPU-inference speedup with < 2% accuracy drop, and 29x CPU-inference speedup with < 7.5% accuracy drop. Our code, fully integrated with Transformers and SparseML, is available at https://github.com/neuralmagic/sparseml/tree/main/research/optimal_BERT_surgeon_oBERT.},
  author       = {Kurtic, Eldar and Campos, Daniel and Nguyen, Tuan and Frantar, Elias and Kurtz, Mark and Fineran, Benjamin and Goin, Michael and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  location     = {Abu Dhabi, United Arab Emirates},
  pages        = {4163--4181},
  publisher    = {Association for Computational Linguistics},
  title        = {{The optimal BERT surgeon: Scalable and accurate second-order pruning for large language models}},
  doi          = {10.18653/v1/2022.emnlp-main.279},
  year         = {2022},
}

@article{8286,
  abstract     = {We consider the following dynamic load-balancing process: given an underlying graph G with n nodes, in each step t≥ 0, one unit of load is created, and placed at a randomly chosen graph node. In the same step, the chosen node picks a random neighbor, and the two nodes balance their loads by averaging them. We are interested in the expected gap between the minimum and maximum loads at nodes as the process progresses, and its dependence on n and on the graph structure. Variants of the above graphical balanced allocation process have been studied previously by Peres, Talwar, and Wieder [Peres et al., 2015], and by Sauerwald and Sun [Sauerwald and Sun, 2015]. These authors left as open the question of characterizing the gap in the case of cycle graphs in the dynamic case, where weights are created during the algorithm’s execution. For this case, the only known upper bound is of 𝒪(n log n), following from a majorization argument due to [Peres et al., 2015], which analyzes a related graphical allocation process. In this paper, we provide an upper bound of 𝒪 (√n log n) on the expected gap of the above process for cycles of length n. We introduce a new potential analysis technique, which enables us to bound the difference in load between k-hop neighbors on the cycle, for any k ≤ n/2. We complement this with a "gap covering" argument, which bounds the maximum value of the gap by bounding its value across all possible subsets of a certain structure, and recursively bounding the gaps within each subset. We provide analytical and experimental evidence that our upper bound on the gap is tight up to a logarithmic factor. },
  author       = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Sabour, Amirmojtaba},
  issn         = {1432-0541},
  journal      = {Algorithmica},
  location     = {Virtual, Online; Germany},
  number       = {4},
  pages        = {1007--1029},
  publisher    = {Springer Nature},
  title        = {{Dynamic averaging load balancing on cycles}},
  doi          = {10.1007/s00453-021-00905-9},
  volume       = {84},
  year         = {2022},
}

@inproceedings{17087,
  abstract     = {We consider the problem of model compression for deep neural networks (DNNs) in the challenging one-shot/post-training setting, in which we are given an accurate trained model, and must compress it without any retraining, based only on a small amount of calibration input data. This problem has become popular in view of the emerging software and hardware support for executing models compressed via pruning and/or quantization with speedup, and well-performing solutions have been proposed independently for both compression approaches.In this paper, we introduce a new compression framework which covers both weight pruning and quantization in a unified setting, is time- and space-efficient, and considerably improves upon the practical performance of existing post-training methods. At the technical level, our approach is based on an exact and efficient realization of the classical Optimal Brain Surgeon (OBS) framework of [LeCun, Denker, and Solla, 1990] extended to also cover weight quantization at the scale of modern DNNs. From the practical perspective, our experimental results show that it can improve significantly upon the compression-accuracy trade-offs of existing post-training methods, and that it can enable the accurate compound application of both pruning and quantization in a post-training setting.},
  author       = {Frantar, Elias and Singh, Sidak Pal and Alistarh, Dan-Adrian},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  location     = {New Orleans, LA, United States},
  publisher    = {ML Research Press},
  title        = {{Optimal brain compression: A framework for accurate post-training quantization and pruning}},
  volume       = {35},
  year         = {2022},
}

@article{11420,
  abstract     = {Understanding the properties of neural networks trained via stochastic gradient descent (SGD) is at the heart of the theory of deep learning. In this work, we take a mean-field view, and consider a two-layer ReLU network trained via noisy-SGD for a univariate regularized regression problem. Our main result is that SGD with vanishingly small noise injected in the gradients is biased towards a simple solution: at convergence, the ReLU network implements a piecewise linear map of the inputs, and the number of “knot” points -- i.e., points where the tangent of the ReLU network estimator changes -- between two consecutive training inputs is at most three. In particular, as the number of neurons of the network grows, the SGD dynamics is captured by the solution of a gradient flow and, at convergence, the distribution of the weights approaches the unique minimizer of a related free energy, which has a Gibbs form. Our key technical contribution consists in the analysis of the estimator resulting from this minimizer: we show that its second derivative vanishes everywhere, except at some specific locations which represent the “knot” points. We also provide empirical evidence that knots at locations distinct from the data points might occur, as predicted by our theory.},
  author       = {Shevchenko, Aleksandr and Kungurtsev, Vyacheslav and Mondelli, Marco},
  issn         = {1533-7928},
  journal      = {Journal of Machine Learning Research},
  number       = {130},
  pages        = {1--55},
  publisher    = {Journal of Machine Learning Research},
  title        = {{Mean-field analysis of piecewise linear solutions for wide ReLU networks}},
  volume       = {23},
  year         = {2022},
}

@inproceedings{10853,
  abstract     = {Dynamic Connectivity is a fundamental algorithmic graph problem, motivated by a wide range of applications to social and communication networks and used as a building block in various other algorithms, such as the bi-connectivity and the dynamic minimal spanning tree problems. In brief, we wish to maintain the connected components of the graph under dynamic edge insertions and deletions. In the sequential case, the problem has been well-studied from both theoretical and practical perspectives. However, much less is known about efficient concurrent solutions to this problem. This is the gap we address in this paper. We start from one of the classic data structures used to solve this problem, the Euler Tour Tree. Our first contribution is a non-blocking single-writer implementation of it. We leverage this data structure to obtain the first truly concurrent generalization of dynamic connectivity, which preserves the time complexity of its sequential counterpart, but is also scalable in practice. To achieve this, we rely on three main techniques. The first is to ensure that connectivity queries, which usually dominate real-world workloads, are non-blocking. The second non-trivial technique expands the above idea by making all queries that do not change the connectivity structure non-blocking. The third ingredient is applying fine-grained locking for updating the connected components, which allows operations on disjoint components to occur in parallel. We evaluate the resulting algorithm on various workloads, executing on both real and synthetic graphs. The results show the efficiency of each of the proposed optimizations; the most efficient variant improves the performance of a coarse-grained based implementation on realistic scenarios up to 6x on average and up to 30x when connectivity queries dominate.},
  author       = {Fedorov, Alexander and Koval, Nikita and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 33rd ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn         = {9781450380706},
  location     = {Virtual, Online},
  pages        = {208--220},
  publisher    = {Association for Computing Machinery},
  title        = {{A scalable concurrent algorithm for dynamic connectivity}},
  doi          = {10.1145/3409964.3461810},
  year         = {2021},
}

@inproceedings{10854,
  abstract     = {Consider a distributed task where the communication network is fixed but the local inputs given to the nodes of the distributed system may change over time. In this work, we explore the following question: if some of the local inputs change, can an existing solution be updated efficiently, in a dynamic and distributed manner?
To address this question, we define the batch dynamic CONGEST model in which we are given a bandwidth-limited communication network and a dynamic edge labelling defines the problem input. The task is to maintain a solution to a graph problem on the labelled graph under batch changes. We investigate, when a batch of alpha edge label changes arrive, - how much time as a function of alpha we need to update an existing solution, and - how much information the nodes have to keep in local memory between batches in order to update the solution quickly.
Our work lays the foundations for the theory of input-dynamic distributed network algorithms. We give a general picture of the complexity landscape in this model, design both universal algorithms and algorithms for concrete problems, and present a general framework for lower bounds. The diverse time complexity of our model spans from constant time, through time polynomial in alpha, and to alpha time, which we show to be enough for any task.},
  author       = {Foerster, Klaus-Tycho and Korhonen, Janne and Paz, Ami and Rybicki, Joel and Schmid, Stefan},
  booktitle    = {Abstract Proceedings of the 2021 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems},
  isbn         = {9781450380720},
  location     = {Virtual, Online},
  pages        = {71--72},
  publisher    = {Association for Computing Machinery},
  title        = {{Input-dynamic distributed algorithms for communication networks}},
  doi          = {10.1145/3410220.3453923},
  year         = {2021},
}