@phdthesis{21198,
  abstract     = {In recent years there has been a massive increase in the amount of data generated in a
decentralized manner. Ever more powerful edge devices, such as smartphones, have become
ubiquitous in most societies on earth. Through text typed, photos taken and apps used,
these devices, which we refer to as clients, generate enormous amounts of high quality and
complex data. Moreover, the nature of these devices means the data they generate is often
sensitive and privacy concerns prevent it being gathered and stored in a central location. This
presents a challenge to the modern machine learning paradigm that requires central access
to large amounts of data. Federated learning (FL) has emerged as one of the answers to
this problem. Rather than bringing the data to the model, FL sends the model to the data.
Model training takes place on device, with periodically synchronized updates, allowing data to
remain locally stored. While this approach offers significant privacy advantages it comes with
its own set of unique challenges. These include: data heterogeneity, the notion that different
devices generate data in distinct ways which can negatively impact training dynamics; systems
heterogeneity, meaning that different devices may have differing hardware specifications; high
communication costs, which are induced by the repeated transferring of models over the
network and low device computational power, which limits the use of larger models on device.
In this thesis we present a range of methods for federated learning. We focus primarily on
the challenge of data heterogeneity, though the methods presented are designed to be well
adapted to the other challenges of a federated setting, such as the constraints of limited
compute and communication overhead. We first present a method for explicitly modeling client
data heterogeneity. The approach formulates clients as samples from a certain probability
distribution and infers the parameters of this distribution from the available training clients.
This learned distribution then represents the heterogeneity present among the clients and can
be sampled from in order to create new simulated clients that are similar to the real clients we
have observed so far. Following this we present two methods for directly dealing with data
heterogeneity through personalization. Highly heterogeneous client data distributions can mean
that learning a single global model becomes suboptimal, and some form of personalization of
models to each individual client is required. Our approaches are based around hypernetworks,
which we use to generate personalized model parameters without the need for additional
training or finetuning. In the first approach we focus on generating full parameterizations of
client models using learned embeddings of client data and labels, with a hypernetwork located
on the central server. In the second approach we address the more challenging scenario where
we want to generate a personalized model for a client without any label information. The
hypernetwork is trained to generate a low dimensional representation of a client’s personalized
model parameters, allowing it to be transferred to and run on the client devices. In our final
presented method, we change our focus and rather than aim to directly address the challenge
of data heterogeneity, we instead ensure we are unaffected by it. This is done in the context
of k-means clustering and we present a method for federated clustering with a focus on added
privacy guarantees.},
  author       = {Scott, Jonathan A},
  issn         = {2663-337X},
  pages        = {158},
  publisher    = {Institute of Science and Technology Austria},
  school       = {Institute of Science and Technology Austria},
  title        = {{Data heterogeneity and personalization in federated learning}},
  doi          = {10.15479/AT-ISTA-21198},
  year         = {2026},
}

@inproceedings{20819,
  abstract     = {Clustering is a cornerstone of data analysis that is particularly suited to identifying coherent subgroups or substructures in unlabeled data, as are generated continuously in large amounts these days. However, in many cases traditional clustering methods are not applicable, because data are increasingly being produced and stored in a distributed way, e.g. on edge devices, and privacy concerns prevent it from being transferred to a central server. To address this challenge, we present FedDP-KMeans, a new algorithm for k-means clustering that is fully-federated as well as differentially private. Our approach leverages (potentially small and out-of-distribution) server-side data to overcome the primary challenge of differentially private clustering methods: the need for a good initialization. Combining our initialization with a simple federated DP-Lloyds algorithm we obtain an algorithm that achieves excellent results on synthetic and real-world benchmark tasks. We also provide a theoretical analysis of our method that provides bounds on the convergence speed and cluster identification success.},
  author       = {Scott, Jonathan A and Lampert, Christoph and Saulpic, David},
  booktitle    = {Proceedings of the 42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {53757--53790},
  publisher    = {ML Research Press},
  title        = {{Differentially private federated k-means clustering with server-side data}},
  volume       = {267},
  year         = {2025},
}

@unpublished{21207,
  abstract     = {Personalized federated learning has emerged as a popular approach to training on devices holding statistically heterogeneous data, known as clients. However, most existing approaches require a client to have labeled data for training or finetuning in order to obtain their own personalized model. In this paper we address this by proposing FLowDUP, a novel method that is able to generate a personalized model using only a forward pass with unlabeled data. The generated model parameters reside in a low-dimensional subspace, enabling efficient communication and computation. FLowDUP's learning objective is theoretically motivated by our new transductive multi-task PAC-Bayesian generalization bound, that provides performance guarantees for unlabeled clients. The objective is structured in such a way that it allows both clients with labeled data and clients with only unlabeled data to contribute to the training process. To supplement our theoretical results we carry out a thorough experimental evaluation of FLowDUP, demonstrating strong empirical performance on a range of datasets with differing sorts of statistically heterogeneous clients. Through numerous ablation studies, we test the efficacy of the individual components of the method.},
  author       = {Zakerinia, Hossein and Scott, Jonathan A and Lampert, Christoph},
  eprint       = {2505.15579},
  eprinttype   = {arXiv},
  note         = {arXiv preprint},
  title        = {{Federated learning with unlabeled clients: Personalization can happen in low dimensions}},
  doi          = {10.48550/ARXIV.2505.15579},
  year         = {2025},
}

@inproceedings{17411,
  abstract     = {We present PeFLL, a new personalized federated learning algorithm that improves
over the state-of-the-art in three aspects: 1) it produces more accurate models,
especially in the low-data regime, and not only for clients present during its
training phase, but also for any that may emerge in the future; 2) it reduces the
amount of on-client computation and client-server communication by providing
future clients with ready-to-use personalized models that require no additional
finetuning or optimization; 3) it comes with theoretical guarantees that establish
generalization from the observed clients to future ones.
At the core of PeFLL lies a learning-to-learn approach that jointly trains an
embedding network and a hypernetwork. The embedding network is used to
represent clients in a latent descriptor space in a way that reflects their similarity
to each other. The hypernetwork takes as input such descriptors and outputs the
parameters of fully personalized client models. In combination, both networks
constitute a learning algorithm that achieves state-of-the-art performance in several
personalized federated learning benchmarks.},
  author       = {Scott, Jonathan A and Zakerinia, Hossein and Lampert, Christoph},
  booktitle    = {12th International Conference on Learning Representations},
  location     = {Vienna, Austria},
  publisher    = {OpenReview},
  title        = {{PeFLL: Personalized federated learning by learning to learn}},
  year         = {2024},
}

@inproceedings{18120,
  author       = {Scott, Jonathan A and Cahill, Áine},
  title        = {{Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials}},
  abstract     = {In practice, training using federated learning can be orders of magnitude slower than standard centralized training. This severely limits the amount of experimentation and tuning that can be done, making it challenging to obtain good performance on a given task. Server-side proxy data can be used to run training simulations, for instance for hyperparameter tuning. This can greatly speed up the training pipeline by reducing the number of tuning runs to be performed overall on the true clients. However, it is challenging to ensure that these simulations accurately reflect the dynamics of the real federated training. In particular, the proxy data used for simulations often comes as a single centralized dataset without a partition into distinct clients, and partitioning this data in a naive way can lead to simulations that poorly reflect real federated training. In this paper we address the challenge of how to partition centralized data in a way that reflects the statistical heterogeneity of the true federated clients. We propose a fully federated, theoretically justified, algorithm that efficiently learns the distribution of the true clients and observe improved server-side simulations when using the inferred distribution to create simulated clients from the centralized data.},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  volume       = {235},
  pages        = {44012--44037},
  publisher    = {ML Research Press},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  year         = {2024},
}

@article{12660,
  abstract     = {We present Cross-Client Label Propagation(XCLP), a new method for transductive federated learning. XCLP estimates a data graph jointly from the data of multiple clients and computes labels for the unlabeled data by propagating label information across the graph. To avoid clients having to share their data with anyone, XCLP employs two cryptographically secure protocols: secure Hamming distance computation and secure summation. We demonstrate two distinct applications of XCLP within federated learning. In the first, we use it in a one-shot way to predict labels for unseen test points. In the second, we use it to repeatedly pseudo-label unlabeled training data in a federated semi-supervised setting. Experiments on both real federated and standard benchmark datasets show that in both applications XCLP achieves higher classification accuracy than alternative approaches.},
  author       = {Scott, Jonathan A and Yeo, Michelle X and Lampert, Christoph},
  journal      = {Transactions on Machine Learning Research},
  issn         = {2835-8856},
  publisher    = {Curran Associates},
  title        = {{Cross-client label propagation for transductive and semi-supervised federated learning}},
  year         = {2023},
}

