@inproceedings{12780,
  author       = {Markov, Ilia and Ramezanikebrya, Hamidreza and Alistarh, Dan-Adrian},
  title        = {{CGX}: Adaptive System Support for Communication-Efficient Deep Learning},
  booktitle    = {Proceedings of the 23rd {ACM/IFIP} International Middleware Conference},
  location     = {Quebec, QC, Canada},
  publisher    = {Association for Computing Machinery},
  pages        = {241--254},
  isbn         = {9781450393409},
  doi          = {10.1145/3528535.3565248},
  year         = {2022},
  abstract     = {The ability to scale out training workloads has been one of the key performance enablers of deep learning. The main scaling approach is data-parallel GPU-based training, which has been boosted by hardware and software support for highly efficient point-to-point communication, and in particular via hardware bandwidth over-provisioning. Overprovisioning comes at a cost: there is an order of magnitude price difference between ``cloud-grade'' servers with such support, relative to their popular ``consumer-grade'' counterparts, although single server-grade and consumer-grade GPUs can have similar computational envelopes.

In this paper, we show that the costly hardware overprovisioning approach can be supplanted via algorithmic and system design, and propose a framework called CGX, which provides efficient software support for compressed communication in ML applications, for both multi-GPU single-node training, as well as larger-scale multi-node training. CGX is based on two technical advances: At the system level, it relies on a re-developed communication stack for ML frameworks, which provides flexible, highly-efficient support for compressed communication. At the application level, it provides seamless, parameter-free integration with popular frameworks, so that end-users do not have to modify training recipes, nor significant training code. This is complemented by a layer-wise adaptive compression technique which dynamically balances compression gains with accuracy preservation. CGX integrates with popular ML frameworks, providing up to 3X speedups for multi-GPU nodes based on commodity hardware, and order-of-magnitude improvements in the multi-node setting, with negligible impact on accuracy.},
}

