@inproceedings{7201,
  abstract     = {Applying machine learning techniques to the quickly growing data in science and industry requires highly-scalable algorithms. Large datasets are most commonly processed "data parallel" distributed across many nodes. Each node's contribution to the overall gradient is summed using a global allreduce. This allreduce is the single communication and thus scalability bottleneck for most machine learning workloads. We observe that frequently, many gradient values are (close to) zero, leading to sparse of sparsifyable communications. To exploit this insight, we analyze, design, and implement a set of communication-efficient protocols for sparse input data, in conjunction with efficient machine learning algorithms which can leverage these primitives. Our communication protocols generalize standard collective operations, by allowing processes to contribute arbitrary sparse input data vectors. Our generic communication library, SparCML1, extends MPI to support additional features, such as non-blocking (asynchronous) operations and low-precision data representations. As such, SparCML and its techniques will form the basis of future highly-scalable machine learning frameworks.},
  author       = {Renggli, Cedric and Ashkboos, Saleh and Aghagolzadeh, Mehdi and Alistarh, Dan-Adrian and Hoefler, Torsten},
  booktitle    = {International Conference for High Performance Computing, Networking, Storage and Analysis, SC},
  isbn         = {9781450362290},
  issn         = {2167-4337},
  location     = {Denver, CO, Unites States},
  publisher    = {ACM},
  title        = {{SparCML: High-performance sparse communication for machine learning}},
  doi          = {10.1145/3295500.3356222},
  year         = {2019},
}

@inproceedings{7228,
  abstract     = {Traditional concurrent programming involves manipulating shared mutable state. Alternatives to this programming style are communicating sequential processes (CSP) and actor models, which share data via explicit communication. These models have been known for almost half a century, and have recently had started to gain significant traction among modern programming languages. The common abstraction for communication between several processes is the channel. Although channels are similar to producer-consumer data structures, they have different semantics and support additional operations, such as the select expression. Despite their growing popularity, most known implementations of channels use lock-based data structures and can be rather inefficient.

In this paper, we present the first efficient lock-free algorithm for implementing a communication channel for CSP programming. We provide implementations and experimental results in the Kotlin and Go programming languages. Our new algorithm outperforms existing implementations on many workloads, while providing non-blocking progress guarantee. Our design can serve as an example of how to construct general communication data structures for CSP and actor models. },
  author       = {Koval, Nikita and Alistarh, Dan-Adrian and Elizarov, Roman},
  booktitle    = {25th Anniversary of Euro-Par},
  isbn         = {978-3-0302-9399-4},
  issn         = {1611-3349},
  location     = {Göttingen, Germany},
  pages        = {317--333},
  publisher    = {Springer Nature},
  title        = {{Scalable FIFO channels for programming via communicating sequential processes}},
  doi          = {10.1007/978-3-030-29400-7_23},
  volume       = {11725},
  year         = {2019},
}

@inproceedings{7437,
  abstract     = {Most of today's distributed machine learning systems assume reliable networks: whenever two machines exchange information (e.g., gradients or models), the network should guarantee the delivery of the message. At the same time, recent work exhibits the impressive tolerance of machine learning algorithms to errors or noise arising from relaxed communication or synchronization. In this paper, we connect these two trends, and consider the following question: Can we design machine learning systems that are tolerant to network unreliability during training? With this motivation, we focus on a theoretical problem of independent interest-given a standard distributed parameter server architecture, if every communication between the worker and the server has a non-zero probability p of being dropped, does there exist an algorithm that still converges, and at what speed? The technical contribution of this paper is a novel theoretical analysis proving that distributed learning over unreliable network can achieve comparable convergence rate to centralized or distributed learning over reliable networks. Further, we prove that the influence of the packet drop rate diminishes with the growth of the number of parameter servers. We map this theoretical result onto a real-world scenario, training deep neural networks over an unreliable network layer, and conduct network simulation to validate the system improvement by allowing the networks to be unreliable.},
  author       = {Yu, Chen and Tang, Hanlin and Renggli, Cedric and Kassing, Simon and Singla, Ankit and Alistarh, Dan-Adrian and Zhang, Ce and Liu, Ji},
  booktitle    = {36th International Conference on Machine Learning, ICML 2019},
  isbn         = {9781510886988},
  location     = {Long Beach, CA, United States},
  pages        = {12481--12512},
  publisher    = {IMLS},
  title        = {{Distributed learning over unreliable networks}},
  volume       = {2019-June},
  year         = {2019},
}

@inproceedings{6673,
  abstract     = {Several classic problems in graph processing and computational geometry are solved via incremental algorithms, which split computation into a series of small tasks acting on shared state, which gets updated progressively. While the sequential variant of such algorithms usually specifies a fixed (but sometimes random) order in which the tasks should be performed, a standard approach to parallelizing such algorithms is to relax this constraint to allow for out-of-order parallel execution. This is the case for parallel implementations of Dijkstra's single-source shortest-paths (SSSP) algorithm, and for parallel Delaunay mesh triangulation. While many software frameworks parallelize incremental computation in this way, it is still not well understood whether this relaxed ordering approach can still provide any complexity guarantees. In this paper, we address this problem, and analyze the efficiency guarantees provided by a range of incremental algorithms when parallelized via relaxed schedulers. We show that, for algorithms such as Delaunay mesh triangulation and sorting by insertion, schedulers with a maximum relaxation factor of k in terms of the maximum priority inversion allowed will introduce a maximum amount of wasted work of O(łog n poly(k)), where n is the number of tasks to be executed. For SSSP, we show that the additional work is O(poly(k), dmax / wmin), where dmax is the maximum distance between two nodes, and wmin is the minimum such distance. In practical settings where n >> k, this suggests that the overheads of relaxation will be outweighed by the improved scalability of the relaxed scheduler. On the negative side, we provide lower bounds showing that certain algorithms will inherently incur a non-trivial amount of wasted work due to scheduler relaxation, even for relatively benign relaxed schedulers.},
  author       = {Alistarh, Dan-Adrian and Nadiradze, Giorgi and Koval, Nikita},
  booktitle    = {31st ACM Symposium on Parallelism in Algorithms and Architectures},
  isbn         = {9781450361842},
  location     = {Phoenix, AZ, United States},
  pages        = {145--154},
  publisher    = {ACM},
  title        = {{Efficiency guarantees for parallel incremental algorithms under relaxed schedulers}},
  doi          = {10.1145/3323165.3323201},
  year         = {2019},
}

@article{7214,
  abstract     = {Background: Many cancer genomes are extensively rearranged with highly aberrant chromosomal karyotypes. Structural and copy number variations in cancer genomes can be determined via abnormal mapping of sequenced reads to the reference genome. Recently it became possible to reconcile both of these types of large-scale variations into a karyotype graph representation of the rearranged cancer genomes. Such a representation, however, does not directly describe the linear and/or circular structure of the underlying rearranged cancer chromosomes, thus limiting possible analysis of cancer genomes somatic evolutionary process as well as functional genomic changes brought by the large-scale genome rearrangements.

Results: Here we address the aforementioned limitation by introducing a novel methodological framework for recovering rearranged cancer chromosomes from karyotype graphs. For a cancer karyotype graph we formulate an Eulerian Decomposition Problem (EDP) of finding a collection of linear and/or circular rearranged cancer chromosomes that are determined by the graph. We derive and prove computational complexities for several variations of the EDP. We then demonstrate that Eulerian decomposition of the cancer karyotype graphs is not always unique and present the Consistent Contig Covering Problem (CCCP) of recovering unambiguous cancer contigs from the cancer karyotype graph, and describe a novel algorithm CCR capable of solving CCCP in polynomial time. We apply CCR on a prostate cancer dataset and demonstrate that it is capable of consistently recovering large cancer contigs even when underlying cancer genomes are highly rearranged.

Conclusions: CCR can recover rearranged cancer contigs from karyotype graphs thereby addressing existing limitation in inferring chromosomal structures of rearranged cancer genomes and advancing our understanding of both patient/cancer-specific as well as the overall genetic instability in cancer.},
  author       = {Aganezov, Sergey and Zban, Ilya and Aksenov, Vitalii and Alexeev, Nikita and Schatz, Michael C.},
  issn         = {1471-2105},
  journal      = {BMC Bioinformatics},
  publisher    = {BMC},
  title        = {{Recovering rearranged cancer chromosomes from karyotype graphs}},
  doi          = {10.1186/s12859-019-3208-4},
  volume       = {20},
  year         = {2019},
}

@inproceedings{7542,
  abstract     = {We present a novel class of convolutional neural networks (CNNs) for set functions,i.e., data indexed with the powerset of a finite set. The convolutions are derivedas linear, shift-equivariant functions for various notions of shifts on set functions.The framework is fundamentally different from graph convolutions based on theLaplacian, as it provides not one but several basic shifts, one for each element inthe ground set. Prototypical experiments with several set function classificationtasks on synthetic datasets and on datasets derived from real-world hypergraphsdemonstrate the potential of our new powerset CNNs.},
  author       = {Wendler, Chris and Alistarh, Dan-Adrian and Püschel, Markus},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  pages        = {927--938},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Powerset convolutional neural networks}},
  volume       = {32},
  year         = {2019},
}

@inproceedings{6933,
  abstract     = {We design fast deterministic algorithms for distance computation in the CONGESTED CLIQUE model. Our key contributions include:

 - A (2+ε)-approximation for all-pairs shortest paths problem in O(log²n / ε) rounds on unweighted undirected graphs. With a small additional additive factor, this also applies for weighted graphs. This is the first sub-polynomial constant-factor approximation for APSP in this model.
 - A (1+ε)-approximation for multi-source shortest paths problem from O(√n) sources in O(log² n / ε) rounds on weighted undirected graphs. This is the first sub-polynomial algorithm obtaining this approximation for a set of sources of polynomial size.

Our main techniques are new distance tools that are obtained via improved algorithms for sparse matrix multiplication, which we leverage to construct efficient hopsets and shortest paths. Furthermore, our techniques extend to additional distance problems for which we improve upon the state-of-the-art, including diameter approximation, and an exact single-source shortest paths algorithm for weighted undirected graphs in Õ(n^{1/6}) rounds.},
  author       = {Censor-Hillel, Keren and Dory, Michal and Korhonen, Janne and Leitersdorf, Dean},
  booktitle    = {Proceedings of the 2019 ACM Symposium on Principles of Distributed Computin},
  isbn         = {9781450362177},
  location     = {Toronto, ON, Canada},
  pages        = {74--83},
  publisher    = {ACM},
  title        = {{Fast approximate shortest paths in the congested clique}},
  doi          = {10.1145/3293611.3331633},
  year         = {2019},
}

@inproceedings{7812,
  abstract     = {Deep neural networks (DNNs) continue to make significant advances, solving tasks from image classification to translation or reinforcement learning. One aspect of the field receiving considerable attention is efficiently executing deep models in resource-constrained environments, such as mobile or embedded devices. This paper focuses on this problem, and proposes two new compression methods, which jointly leverage weight quantization and distillation of larger teacher networks into smaller student networks. The first method we propose is called quantized distillation and leverages distillation during the training process, by incorporating distillation loss, expressed with respect to the teacher, into the training of a student network whose weights are quantized to a limited set of levels. The second method,  differentiable quantization, optimizes the location of quantization points through stochastic gradient descent, to better fit the behavior of the teacher model.  We validate both methods through experiments on convolutional and recurrent architectures. We show that quantized shallow students can reach similar accuracy levels to full-precision teacher models, while providing order of magnitude compression, and inference speedup that is linear in the depth reduction. In sum, our results enable DNNs for resource-constrained environments to leverage architecture and accuracy advances developed on more powerful devices.},
  author       = {Polino, Antonio and Pascanu, Razvan and Alistarh, Dan-Adrian},
  booktitle    = {6th International Conference on Learning Representations},
  location     = {Vancouver, Canada},
  title        = {{Model compression via distillation and quantization}},
  year         = {2018},
}

@inproceedings{5961,
  abstract     = {The area of machine learning has made considerable progress over the past decade, enabled by the widespread availability of large datasets, as well as by improved algorithms and models. Given the large computational demands of machine learning workloads, parallelism, implemented either through single-node concurrency or through multi-node distribution, has been a third key ingredient to advances in machine learning.
The goal of this tutorial is to provide the audience with an overview of standard distribution techniques in machine learning, with an eye towards the intriguing trade-offs between synchronization and communication costs of distributed machine learning algorithms, on the one hand, and their convergence, on the other.The tutorial will focus on parallelization strategies for the fundamental stochastic gradient descent (SGD) algorithm, which is a key tool when training machine learning models, from classical instances such as linear regression, to state-of-the-art neural network architectures.
The tutorial will describe the guarantees provided by this algorithm in the sequential case, and then move on to cover both shared-memory and message-passing parallelization strategies, together with the guarantees they provide, and corresponding trade-offs. The presentation will conclude with a broad overview of ongoing research in distributed and concurrent machine learning. The tutorial will assume no prior knowledge beyond familiarity with basic concepts in algebra and analysis.
},
  author       = {Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {487--488},
  publisher    = {ACM},
  title        = {{A brief tutorial on distributed and concurrent machine learning}},
  doi          = {10.1145/3212734.3212798},
  year         = {2018},
}

@inproceedings{5962,
  abstract     = {Stochastic Gradient Descent (SGD) is a fundamental algorithm in machine learning, representing the optimization backbone for training several classic models, from regression to neural networks. Given the recent practical focus on distributed machine learning, significant work has been dedicated to the convergence properties of this algorithm under the inconsistent and noisy updates arising from execution in a distributed environment. However, surprisingly, the convergence properties of this classic algorithm in the standard shared-memory model are still not well-understood. In this work, we address this gap, and provide new convergence bounds for lock-free concurrent stochastic gradient descent, executing in the classic asynchronous shared memory model, against a strong adaptive adversary. Our results give improved upper and lower bounds on the "price of asynchrony'' when executing the fundamental SGD algorithm in a concurrent setting. They show that this classic optimization tool can converge faster and with a wider range of parameters than previously known under asynchronous iterations. At the same time, we exhibit a fundamental trade-off between the maximum delay in the system and the rate at which SGD can converge, which governs the set of parameters under which this algorithm can still work efficiently.},
  author       = {Alistarh, Dan-Adrian and De Sa, Christopher and Konstantinov, Nikola H},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {169--178},
  publisher    = {ACM},
  title        = {{The convergence of stochastic gradient descent in asynchronous shared memory}},
  doi          = {10.1145/3212734.3212763},
  year         = {2018},
}

@inproceedings{5963,
  abstract     = {There has been significant progress in understanding the parallelism inherent to iterative sequential algorithms: for many classic algorithms, the depth of the dependence structure is now well understood, and scheduling techniques have been developed to exploit this shallow dependence structure for efficient parallel implementations. A related, applied research strand has studied methods by which certain iterative task-based algorithms can be efficiently parallelized via relaxed concurrent priority schedulers. These allow for high concurrency when inserting and removing tasks, at the cost of executing superfluous work due to the relaxed semantics of the scheduler. In this work, we take a step towards unifying these two research directions, by showing that there exists a family of relaxed priority schedulers that can efficiently and deterministically execute classic iterative algorithms such as greedy maximal independent set (MIS) and matching. Our primary result shows that, given a randomized scheduler with an expected relaxation factor of k in terms of the maximum allowed priority inversions on a task, and any graph on n vertices, the scheduler is able to execute greedy MIS with only an additive factor of \poly(k) expected additional iterations compared to an exact (but not scalable) scheduler. This counter-intuitive result demonstrates that the overhead of relaxation when computing MIS is not dependent on the input size or structure of the input graph. Experimental results show that this overhead can be clearly offset by the gain in performance due to the highly scalable scheduler. In sum, we present an efficient method to deterministically parallelize iterative sequential algorithms, with provable runtime guarantees in terms of the number of executed tasks to completion.},
  author       = {Alistarh, Dan-Adrian and Brown, Trevor A and Kopinsky, Justin and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {377--386},
  publisher    = {ACM},
  title        = {{Relaxed schedulers can efficiently parallelize iterative algorithms}},
  doi          = {10.1145/3212734.3212756},
  year         = {2018},
}

@inproceedings{5964,
  abstract     = {A standard design pattern found in many concurrent data structures, such as hash tables or ordered containers, is an alternation of parallelizable sections that incur no data conflicts and critical sections that must run sequentially and are protected with locks. A lock can be viewed as a queue that arbitrates the order in which the critical sections are executed, and a natural question is whether we can use stochastic analysis to predict the resulting throughput. As a preliminary evidence to the affirmative, we describe a simple model that can be used to predict the throughput of coarse-grained lock-based algorithms. We show that our model works well for CLH lock, and we expect it to work for other popular lock designs such as TTAS, MCS, etc.},
  author       = {Aksenov, Vitaly and Alistarh, Dan-Adrian and Kuznetsov, Petr},
  booktitle    = {Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing  - PODC '18},
  isbn         = {9781450357951},
  location     = {Egham, United Kingdom},
  pages        = {411--413},
  publisher    = {ACM},
  title        = {{Brief Announcement: Performance prediction for coarse-grained locking}},
  doi          = {10.1145/3212734.3212785},
  year         = {2018},
}

@inproceedings{5966,
  abstract     = {The transactional conflict problem arises in transactional systems whenever two or more concurrent transactions clash on a data item. While the standard solution to such conflicts is to immediately abort one of the transactions, some practical systems consider the alternative of delaying conflict resolution for a short interval, which may allow one of the transactions to commit. The challenge in the transactional conflict problem is to choose the optimal length of this delay interval so as to minimize the overall running time penalty for the conflicting transactions. In this paper, we propose a family of optimal online algorithms for the transactional conflict problem. Specifically, we consider variants of this problem which arise in different implementations of transactional systems, namely "requestor wins'' and "requestor aborts'' implementations: in the former, the recipient of a coherence request is aborted, whereas in the latter, it is the requestor which has to abort. Both strategies are implemented by real systems. We show that the requestor aborts case can be reduced to a classic instance of the ski rental problem, while the requestor wins case leads to a new version of this classical problem, for which we derive optimal deterministic and randomized algorithms. Moreover, we prove that, under a simplified adversarial model, our algorithms are constant-competitive with the offline optimum in terms of throughput. We validate our algorithmic results empirically through a hardware simulation of hardware transactional memory (HTM), showing that our algorithms can lead to non-trivial performance improvements for classic concurrent data structures.},
  author       = {Alistarh, Dan-Adrian and Haider, Syed Kamran and Kübler, Raphael and Nadiradze, Giorgi},
  booktitle    = {Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures  - SPAA '18},
  isbn         = {9781450357999},
  location     = {Vienna, Austria},
  pages        = {383--392},
  publisher    = {ACM},
  title        = {{The transactional conflict problem}},
  doi          = {10.1145/3210377.3210406},
  year         = {2018},
}

@article{6001,
  abstract     = {The concurrent memory reclamation problem is that of devising a way for a deallocating thread to verify that no other concurrent threads hold references to a memory block being deallocated. To date, in the absence of automatic garbage collection, there is no satisfactory solution to this problem; existing tracking methods like hazard pointers, reference counters, or epoch-based techniques like RCU are either prohibitively expensive or require significant programming expertise to the extent that implementing them efficiently can be worthy of a publication. None of the existing techniques are automatic or even semi-automated.
In this article, we take a new approach to concurrent memory reclamation. Instead of manually tracking access to memory locations as done in techniques like hazard pointers, or restricting shared accesses to specific epoch boundaries as in RCU, our algorithm, called ThreadScan, leverages operating system signaling to automatically detect which memory locations are being accessed by concurrent threads.
Initial empirical evidence shows that ThreadScan scales surprisingly well and requires negligible programming effort beyond the standard use of Malloc and Free.},
  author       = {Alistarh, Dan-Adrian and Leiserson, William and Matveev, Alexander and Shavit, Nir},
  issn         = {2329-4949},
  journal      = {ACM Transactions on Parallel Computing},
  number       = {4},
  publisher    = {Association for Computing Machinery},
  title        = {{ThreadScan: Automatic and scalable memory reclamation}},
  doi          = {10.1145/3201897},
  volume       = {4},
  year         = {2018},
}

@inproceedings{6031,
  abstract     = {We introduce Clover, a new library for efficient computation using low-precision data, providing mathematical routines required by fundamental methods in optimization and sparse recovery. Our library faithfully implements variants of stochastic quantization that guarantee convergence at low precision, and supports data formats from 4-bit quantized to 32-bit IEEE-754 on current Intel processors. In particular, we show that 4-bit can be implemented efficiently using Intel AVX despite the lack of native support for this data format. Experimental results with dot product, matrix-vector multiplication (MVM), gradient descent (GD), and iterative hard thresholding (IHT) demonstrate that the attainable speedups are in many cases close to linear with respect to the reduction of precision due to reduced data movement. Finally, for GD and IHT, we show examples of absolute speedup achieved by 4-bit versus 32-bit, by iterating until a given target error is achieved.},
  author       = {Stojanov, Alen and Smith, Tyler Michael and Alistarh, Dan-Adrian and Puschel, Markus},
  booktitle    = {2018 IEEE International Workshop on Signal Processing Systems},
  location     = {Cape Town, South Africa},
  publisher    = {IEEE},
  title        = {{Fast quantized arithmetic on x86: Trading compute for data movement}},
  doi          = {10.1109/SiPS.2018.8598402},
  volume       = {2018-October},
  year         = {2018},
}

@inproceedings{6589,
  abstract     = {Distributed training of massive machine learning models, in particular deep neural networks, via Stochastic Gradient Descent (SGD) is becoming commonplace. Several families of communication-reduction methods, such as quantization, large-batch methods, and gradient sparsification, have been proposed. To date, gradient sparsification methods--where each node sorts gradients by magnitude, and only communicates a subset of the components, accumulating the rest locally--are known to yield some of the largest practical gains. Such methods can reduce the amount of communication per step by up to \emph{three orders of magnitude}, while preserving model accuracy. Yet, this family of methods currently has no theoretical justification. This is the question we address in this paper. We prove that, under analytic assumptions, sparsifying gradients by magnitude with local error correction provides convergence guarantees, for both convex and non-convex smooth objectives, for data-parallel SGD. The main insight is that sparsification methods implicitly maintain bounds on the maximum impact of stale updates, thanks to selection by magnitude. Our analysis and empirical validation also reveal that these methods do require analytical conditions to converge well, justifying existing heuristics.},
  author       = {Alistarh, Dan-Adrian and Hoefler, Torsten and Johansson, Mikael and Konstantinov, Nikola H and Khirirat, Sarit and Renggli, Cedric},
  booktitle    = {Advances in Neural Information Processing Systems 31},
  location     = {Montreal, Canada},
  pages        = {5973--5983},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The convergence of sparsified gradient methods}},
  volume       = {Volume 2018},
  year         = {2018},
}

@inproceedings{7116,
  abstract     = {Training deep learning models has received tremendous research interest recently. In particular, there has been intensive research on reducing the communication cost of training when using multiple computational devices, through reducing the precision of the underlying data representation. Naturally, such methods induce system trade-offs—lowering communication precision could de-crease communication overheads and improve scalability; but, on the other hand, it can also reduce the accuracy of training. In this paper, we study this trade-off space, and ask:Can low-precision communication consistently improve the end-to-end performance of training modern neural networks, with no accuracy loss?From the performance point of view, the answer to this question may appear deceptively easy: compressing communication through low precision should help when the ratio between communication and computation is high. However, this answer is less straightforward when we try to generalize this principle across various neural network architectures (e.g., AlexNet vs. ResNet),number of GPUs (e.g., 2 vs. 8 GPUs), machine configurations(e.g., EC2 instances vs. NVIDIA DGX-1), communication primitives (e.g., MPI vs. NCCL), and even different GPU architectures(e.g., Kepler vs. Pascal). Currently, it is not clear how a realistic realization of all these factors maps to the speed up provided by low-precision communication. In this paper, we conduct an empirical study to answer this question and report the insights.},
  author       = {Grubic, Demjan and Tam, Leo and Alistarh, Dan-Adrian and Zhang, Ce},
  booktitle    = {Proceedings of the 21st International Conference on Extending Database Technology},
  isbn         = {9783893180783},
  issn         = {2367-2005},
  location     = {Vienna, Austria},
  pages        = {145--156},
  publisher    = {OpenProceedings},
  title        = {{Synchronous multi-GPU training for deep learning with low-precision communications: An empirical study}},
  doi          = {10.5441/002/EDBT.2018.14},
  year         = {2018},
}

@inproceedings{7123,
  abstract     = {Population protocols are a popular model of distributed computing, in which n agents with limited local state interact randomly, and cooperate to collectively compute global predicates. Inspired by recent developments in DNA programming, an extensive series of papers, across different communities, has examined the computability and complexity characteristics of this model. Majority, or consensus, is a central task in this model, in which agents need to collectively reach a decision as to which one of two states A or B had a higher initial count. Two metrics are important: the time that a protocol requires to stabilize to an output decision, and the state space size that each agent requires to do so. It is known that majority requires Ω(log log n) states per agent to allow for fast (poly-logarithmic time) stabilization, and that O(log2 n) states are sufficient. Thus, there is an exponential gap between the space upper and lower bounds for this problem. This paper addresses this question.

On the negative side, we provide a new lower bound of Ω(log n) states for any protocol which stabilizes in O(n1–c) expected time, for any constant c > 0. This result is conditional on monotonicity and output assumptions, satisfied by all known protocols. Technically, it represents a departure from previous lower bounds, in that it does not rely on the existence of dense configurations. Instead, we introduce a new generalized surgery technique to prove the existence of incorrect executions for any algorithm which would contradict the lower bound. Subsequently, our lower bound also applies to general initial configurations, including ones with a leader. On the positive side, we give a new algorithm for majority which uses O(log n) states, and stabilizes in O(log2 n) expected time. Central to the algorithm is a new leaderless phase clock technique, which allows agents to synchronize in phases of Θ(n log n) consecutive interactions using O(log n) states per agent, exploiting a new connection between population protocols and power-of-two-choices load balancing mechanisms. We also employ our phase clock to build a leader election algorithm with a state space of size O(log n), which stabilizes in O(log2 n) expected time.},
  author       = {Alistarh, Dan-Adrian and Aspnes, James and Gelashvili, Rati},
  booktitle    = {Proceedings of the 29th Annual ACM-SIAM Symposium on Discrete Algorithms},
  isbn         = {9781611975031},
  location     = {New Orleans, LA, United States},
  pages        = {2221--2239},
  publisher    = {ACM},
  title        = {{Space-optimal majority in population protocols}},
  doi          = {10.1137/1.9781611975031.144},
  year         = {2018},
}

@article{76,
  abstract     = {Consider a fully-connected synchronous distributed system consisting of n nodes, where up to f nodes may be faulty and every node starts in an arbitrary initial state. In the synchronous C-counting problem, all nodes need to eventually agree on a counter that is increased by one modulo C in each round for given C&gt;1. In the self-stabilising firing squad problem, the task is to eventually guarantee that all non-faulty nodes have simultaneous responses to external inputs: if a subset of the correct nodes receive an external “go” signal as input, then all correct nodes should agree on a round (in the not-too-distant future) in which to jointly output a “fire” signal. Moreover, no node should generate a “fire” signal without some correct node having previously received a “go” signal as input. We present a framework reducing both tasks to binary consensus at very small cost. For example, we obtain a deterministic algorithm for self-stabilising Byzantine firing squads with optimal resilience f&lt;n/3, asymptotically optimal stabilisation and response time O(f), and message size O(log f). As our framework does not restrict the type of consensus routines used, we also obtain efficient randomised solutions.},
  author       = {Lenzen, Christoph and Rybicki, Joel},
  journal      = {Distributed Computing},
  publisher    = {Springer},
  title        = {{Near-optimal self-stabilising counting and firing squads}},
  doi          = {10.1007/s00446-018-0342-6},
  year         = {2018},
}

@inproceedings{397,
  abstract     = {Concurrent sets with range query operations are highly desirable in applications such as in-memory databases. However, few set implementations offer range queries. Known techniques for augmenting data structures with range queries (or operations that can be used to build range queries) have numerous problems that limit their usefulness. For example, they impose high overhead or rely heavily on garbage collection. In this work, we show how to augment data structures with highly efficient range queries, without relying on garbage collection. We identify a property of epoch-based memory reclamation algorithms that makes them ideal for implementing range queries, and produce three algorithms, which use locks, transactional memory and lock-free techniques, respectively. Our algorithms are applicable to more data structures than previous work, and are shown to be highly efficient on a large scale Intel system. },
  author       = {Arbel Raviv, Maya and Brown, Trevor A},
  isbn         = {978-1-4503-4982-6},
  location     = {Vienna, Austria},
  number       = {1},
  pages        = {14 -- 27},
  publisher    = {ACM},
  title        = {{Harnessing epoch-based reclamation for efficient range queries}},
  doi          = {10.1145/3178487.3178489},
  volume       = {53},
  year         = {2018},
}