@inproceedings{18115,
  abstract     = {We study the data selection problem, whose aim is to select a small representative subset of data that can be used to efficiently train a machine learning model. We present a new data selection approach based on $k$-means clustering and sensitivity sampling. Assuming access to an embedding representation of the data with respect to which the model loss is H\"older continuous, our approach provably allows selecting a set of ``typical'' $k + 1/\varepsilon^2$ elements whose average loss corresponds to the average loss of the whole dataset, up to a multiplicative $(1 \pm \varepsilon)$ factor and an additive $\varepsilon \lambda \Phi_k$, where $\Phi_k$ represents the $k$-means cost for the input embeddings and $\lambda$ is the H\"older constant. We furthermore demonstrate the performance and scalability of our approach on fine-tuning foundation models and show that it outperforms state-of-the-art methods. We also show how it can be applied to linear regression, leading to a new sampling strategy that surprisingly matches the performance of leverage score sampling, while being conceptually simpler and more scalable.},
  author       = {Axiotis, Kyriakos and Cohen-Addad, Vincent and Henzinger, Monika H. and Jerome, Sammy and Mirrokni, Vahab and Saulpic, David and Woodruff, David P. and Wunder, Michael},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {2086--2107},
  publisher    = {PMLR},
  title        = {{Data-efficient learning via clustering-based sensitivity sampling: Foundation models and beyond}},
  volume       = {235},
  year         = {2024},
}
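
% The abstract describes a clustering-based sensitivity sampling procedure:
% cluster the embeddings with k-means, give each point a sampling probability
% driven by its distance to its cluster center, and draw a weighted sample of
% roughly k + 1/eps^2 points whose weighted average loss approximates the full
% dataset's average loss. The Python below is a minimal illustrative sketch of
% that idea, not the authors' reference implementation: the function name
% sensitivity_sample, the scikit-learn KMeans dependency, and the exact
% sensitivity formula are assumptions for illustration; the paper gives the
% precise sensitivities and guarantees.
%
% import numpy as np
% from sklearn.cluster import KMeans
%
% def sensitivity_sample(embeddings, k, num_samples, seed=0):
%     rng = np.random.default_rng(seed)
%     km = KMeans(n_clusters=k, n_init="auto", random_state=seed).fit(embeddings)
%     # squared distance of each point to its assigned center (its k-means cost)
%     centers = km.cluster_centers_[km.labels_]
%     cost = ((embeddings - centers) ** 2).sum(axis=1)
%     # per-cluster average cost, spread back to the points of each cluster
%     cluster_sums = np.bincount(km.labels_, weights=cost, minlength=k)
%     cluster_sizes = np.maximum(np.bincount(km.labels_, minlength=k), 1)
%     cluster_avg = cluster_sums / cluster_sizes
%     # one common cluster-based sensitivity proxy; a small floor keeps the
%     # probabilities well defined even if the clustering cost is zero
%     sensitivity = cost + cluster_avg[km.labels_] + 1e-12
%     probs = sensitivity / sensitivity.sum()
%     idx = rng.choice(len(embeddings), size=num_samples, replace=True, p=probs)
%     # importance weights: the weighted sample mean of the per-point losses,
%     # mean(weights * losses[idx]), is an unbiased estimate of the average
%     # loss over the full dataset
%     weights = 1.0 / (len(embeddings) * probs[idx])
%     return idx, weights
%
% Usage: idx, weights = sensitivity_sample(X, k=100, num_samples=5000) selects
% indices into X and their importance weights for downstream training or loss
% estimation. (Lines are %-commented so BibTeX tools ignore this sketch.)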

