@phdthesis{21198,
  abstract     = {In recent years there has been a massive increase in the amount of data generated in a
decentralized manner. Ever more powerful edge devices, such as smartphones, have become
ubiquitous in most societies on earth. Through text typed, photos taken and apps used,
these devices, which we refer to as clients, generate enormous amounts of high quality and
complex data. Moreover, the nature of these devices means the data they generate is often
sensitive and privacy concerns prevent it being gathered and stored in a central location. This
presents a challenge to the modern machine learning paradigm that requires central access
to large amounts of data. Federated learning (FL) has emerged as one of the answers to
this problem. Rather than bringing the data to the model, FL sends the model to the data.
Model training takes place on device, with periodically synchronized updates, allowing data to
remain locally stored. While this approach offers significant privacy advantages it comes with
its own set of unique challenges. These include: data heterogeneity, the notion that different
devices generate data in distinct ways which can negatively impact training dynamics; systems
heterogeneity, meaning that different devices may have differing hardware specifications; high
communication costs, which are induced by the repeated transferring of models over the
network and low device computational power, which limits the use of larger models on device.
In this thesis we present a range of methods for federated learning. We focus primarily on
the challenge of data heterogeneity, though the methods presented are designed to be well
adapted to the other challenges of a federated setting, such as the constraints of limited
compute and communication overhead. We first present a method for explicitly modeling client
data heterogeneity. The approach formulates clients as samples from a certain probability
distribution and infers the parameters of this distribution from the available training clients.
This learned distribution then represents the heterogeneity present among the clients and can
be sampled from in order to create new simulated clients that are similar to the real clients we
have observed so far. Following this we present two methods for directly dealing with data
heterogeneity through personalization. Highly heterogeneous client data distributions can mean
that learning a single global model becomes suboptimal, and some form of personalization of
models to each individual client is required. Our approaches are based around hypernetworks,
which we use to generate personalized model parameters without the need for additional
training or finetuning. In the first approach we focus on generating full parameterizations of
client models using learned embeddings of client data and labels, with a hypernetwork located
on the central server. In the second approach we address the more challenging scenario where
we want to generate a personalized model for a client without any label information. The
hypernetwork is trained to generate a low dimensional representation of a client’s personalized
model parameters, allowing it to be transferred to and run on the client devices. In our final
presented method, we change our focus and rather than aim to directly address the challenge
of data heterogeneity, we instead ensure we are unaffected by it. This is done in the context
of k-means clustering and we present a method for federated clustering with a focus on added
privacy guarantees.},
  author       = {Scott, Jonathan A},
  issn         = {2663-337X},
  pages        = {158},
  publisher    = {Institute of Science and Technology Austria},
  title        = {{Data heterogeneity and personalization in federated learning}},
  doi          = {10.15479/AT-ISTA-21198},
  year         = {2026},
}

@article{12662,
  abstract     = {Modern machine learning tasks often require considering not just one but multiple objectives. For example, besides the prediction quality, this could be the efficiency, robustness or fairness of the learned models, or any of their combinations. Multi-objective learning offers a natural framework for handling such problems without having to commit to early trade-offs. Surprisingly, statistical learning theory so far offers almost no insight into the generalization properties of multi-objective learning. In this work, we make first steps to fill this gap: We establish foundational generalization bounds for the multi-objective setting as well as generalization and excess bounds for learning with scalarizations. We also provide the first theoretical analysis of the relation between the Pareto-optimal sets of the true objectives and the Pareto-optimal sets of their empirical approximations from training data. In particular, we show a surprising asymmetry: All Pareto-optimal solutions can be approximated by empirically Pareto-optimal ones, but not vice versa.},
  author       = {Súkeník, Peter and Lampert, Christoph},
  issn         = {1433-3058},
  journal      = {Neural Computing and Applications},
  pages        = {24669--24683},
  publisher    = {Springer Nature},
  title        = {{Generalization in multi-objective machine learning}},
  doi          = {10.1007/s00521-024-10616-1},
  volume       = {37},
  year         = {2025},
}

@inproceedings{20256,
  abstract     = {We study the problem of predictive runtime monitoring of black-box dynamical systems with quantitative safety properties. The black-box setting stipulates that the exact semantics of the dynamical system and the controller are unknown, and that we are only able to observe the state of the controlled (aka, closed-loop) system at finitely many time points. We present a novel framework for predicting future states of the system based on the states observed in the past. The numbers of past states and of predicted future states are parameters provided by the user. Our method is based on a combination of Taylor’s expansion and the backward difference operator for numerical differentiation. We also derive an upper bound on the prediction error under the assumption that the system dynamics and the controller are smooth. The predicted states are then used to predict safety violations ahead in time. Our experiments demonstrate practical applicability of our method for complex black-box systems, showing that it is computationally lightweight and yet significantly more accurate than the state-of-the-art predictive safety monitoring techniques.},
  author       = {Henzinger, Thomas A and Kresse, Fabian and Mallik, Kaushik and Yu, Zhengqi and Zikelic, Dorde},
  booktitle    = {7th Annual Learning for Dynamics \& Control Conference},
  issn         = {2640-3498},
  location     = {Ann Arbor, MI, United States},
  pages        = {804--816},
  publisher    = {ML Research Press},
  title        = {{Predictive monitoring of black-box dynamical systems}},
  volume       = {283},
  year         = {2025},
}

@inproceedings{20296,
  abstract     = {Learning-based systems are increasingly deployed across various domains, yet the complexity of traditional neural networks poses significant challenges for formal verification. Unlike conventional neural networks, learned Logic Gate Networks (LGNs) replace multiplications with Boolean logic gates, yielding a sparse, netlist-like architecture that is inherently more amenable to symbolic verification, while still delivering promising performance. In this paper, we introduce a SAT encoding for verifying global robustness and fairness in LGNs. We evaluate our method on five benchmark datasets, including a newly constructed 5-class variant, and find that LGNs are both verification-friendly and maintain strong predictive performance.},
  author       = {Kresse, Fabian and Yu, Zhengqi and Lampert, Christoph and Henzinger, Thomas A},
  booktitle    = {2nd International Conference on Neuro-Symbolic Systems},
  issn         = {2640-3498},
  location     = {Philadelphia, PA, United States},
  publisher    = {ML Research Press},
  title        = {{Logic gate neural networks are good for verification}},
  volume       = {288},
  year         = {2025},
}

@inproceedings{20298,
  abstract     = {In this paper, we study the problem of estimating the unknown mean θ of a unit variance Gaussian distribution in a locally differentially private (LDP) way. In the high-privacy regime (ϵ ≤ 1), we identify an optimal privacy mechanism that minimizes the variance of the estimator asymptotically. Our main technical contribution is the maximization of the Fisher-Information of the sanitized data with respect to the local privacy mechanism Q. We find that the exact solution Qθ,ϵ of this maximization is the sign mechanism that applies randomized response to the sign of Xi−θ, where X1,…,Xn are the confidential iid original samples. However, since this optimal local mechanism depends on the unknown mean θ, we employ a two-stage LDP parameter estimation procedure which requires splitting agents into two groups. The first n1 observations are used to consistently but not necessarily efficiently estimate the parameter θ by θ~n1. Then this estimate is updated by applying the sign mechanism with θ~n1 instead of θ to the remaining n−n1 observations, to obtain an LDP and efficient estimator of the unknown mean.},
  author       = {Kalinin, Nikita and Steinberger, Lukas},
  booktitle    = {Proceedings of the 28th International Conference on Artificial Intelligence and Statistics},
  issn         = {2640-3498},
  location     = {Mai Khao, Thailand},
  pages        = {118--126},
  publisher    = {ML Research Press},
  title        = {{Efficient estimation of a Gaussian mean with local differential privacy}},
  volume       = {258},
  year         = {2025},
}

@inproceedings{20455,
  abstract     = {Despite extensive research since the community learned about adversarial examples 10 years ago, we still do not know how to train high-accuracy classifiers that are guaranteed to be robust to small perturbations of their inputs. Previous works often argued that this might be because no classifier exists that is robust and accurate at the same time. However, in computer vision this assumption does not match reality where humans are usually accurate and robust on most tasks of interest. We offer an alternative explanation and show that in certain settings robust generalization is only possible with unrealistically large amounts of data. Specifically, we find a setting where a robust classifier exists, it is easy to learn an accurate classifier, yet it requires an exponential amount of data to learn a robust classifier. Based on this theoretical result, we evaluate the influence of the amount of training data on datasets such as CIFAR10. Our findings indicate that the amount of training data is the main factor determining the robust performance. Furthermore we show that there are low magnitude directions in the data which are useful for non-robust generalization but are not available for robust classifiers. This implies that robust classification is a strictly harder task than normal classification, thereby providing an explanation why robust classification requires more data.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},
  isbn         = {9798331599942},
  issn         = {2160-7516},
  location     = {Nashville, TN, United States},
  pages        = {660--669},
  publisher    = {IEEE},
  title        = {{Intriguing properties of robust classification}},
  doi          = {10.1109/CVPRW67362.2025.00071},
  year         = {2025},
}

@inproceedings{20819,
  abstract     = {Clustering is a cornerstone of data analysis that is particularly suited to identifying coherent subgroups or substructures in unlabeled data, as are generated continuously in large amounts these days. However, in many cases traditional clustering methods are not applicable, because data are increasingly being produced and stored in a distributed way, e.g. on edge devices, and privacy concerns prevent it from being transferred to a central server. To address this challenge, we present FedDP-KMeans, a new algorithm for k-means clustering that is fully-federated as well as differentially private. Our approach leverages (potentially small and out-of-distribution) server-side data to overcome the primary challenge of differentially private clustering methods: the need for a good initialization. Combining our initialization with a simple federated DP-Lloyds algorithm we obtain an algorithm that achieves excellent results on synthetic and real-world benchmark tasks. We also provide a theoretical analysis of our method that provides bounds on the convergence speed and cluster identification success.},
  author       = {Scott, Jonathan A and Lampert, Christoph and Saulpic, David},
  booktitle    = {42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {53757--53790},
  publisher    = {ML Research Press},
  title        = {{Differentially private federated k-means clustering with server-side data}},
  volume       = {267},
  year         = {2025},
}

@unpublished{21207,
  abstract     = {Personalized federated learning has emerged as a popular approach to training on devices holding statistically heterogeneous data, known as clients. However, most existing approaches require a client to have labeled data for training or finetuning in order to obtain their own personalized model. In this paper we address this by proposing FLowDUP, a novel method that is able to generate a personalized model using only a forward pass with unlabeled data. The generated model parameters reside in a low-dimensional subspace, enabling efficient communication and computation. FLowDUP's learning objective is theoretically motivated by our new transductive multi-task PAC-Bayesian generalization bound, that provides performance guarantees for unlabeled clients. The objective is structured in such a way that it allows both clients with labeled data and clients with only unlabeled data to contribute to the training process. To supplement our theoretical results we carry out a thorough experimental evaluation of FLowDUP, demonstrating strong empirical performance on a range of datasets with differing sorts of statistically heterogeneous clients. Through numerous ablation studies, we test the efficacy of the individual components of the method.},
  author       = {Zakerinia, Hossein and Scott, Jonathan A and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{Federated learning with unlabeled clients: Personalization can happen in low dimensions}},
  doi          = {10.48550/ARXIV.2505.15579},
  year         = {2025},
}

@phdthesis{19759,
  abstract     = {Despite generating remarkable results in various computer vision tasks, deep learning comes
with some surprising shortcomings. For example, tiny perturbations, often imperceptible to
the human eye, can completely change the predictions of image classifiers. Despite a decade
of research, the field has made limited progress in developing image classifiers that are both
accurate and robust. This thesis aims to address this gap.
As our first contribution, we aim to simplify the process of training certifiably robust image
classifiers. We do this by designing a convolutional layer that does not require executing an
iterative procedure in every forward pass, but relies on an explicit bound instead. We also
propose a loss function that allows optimizing for a particular margin more precisely.
Next, we provide an overview and comparison of various methods that create robust image
classifiers by constraining the Lipschitz constant. This is important since generally longer
training times and more parameters improve the performance of robust classifiers, making it
challenging to determine the most practical and effective methods from existing literature.
In 1-Lipschitz classification, the performance of current methods is still much worse than what
we expect on the simple tasks we consider. Therefore, we next investigate potential causes of
this shortcoming. We first consider the role of the activation function. We prove a theoretical
shortcoming of the commonly used activation function, and provide an alternative without it.
However this theoretical improvement does barely translate to the empirical performance of
robust classifiers, suggesting a different bottleneck.
Therefore, in the final chapter, we study how the performance depends on the amount of
training data. We prove that in the worst case, we might require far more data to train a
robust classifier compared to a normal one. We furthermore find that the amount of training
data is a key determinant of the performance current methods achieve on popular datasets.
Additionally, we show that linear subspaces exist with tiny data variance, and yet we can
still train very accurate classifiers after projecting into those subspaces. This shows that on
the datasets considered, enforcing robustness in classification makes the task strictly more
challenging.

-----------------“In reference to IEEE copyrighted material which is used with permission in this thesis, the IEEE does not endorse any of [name of university or educational entity]’s products or services. Internal or personal use of this material is permitted. If interested in reprinting/republishing IEEE copyrighted material for advertising or promotional purposes or for creating new collective works for resale or redistribution, please go to http://www.ieee.org/publications_standards/publications/rights/rights_link.html to learn how to obtain a License from RightsLink. If applicable, University Microfilms and/or ProQuest Library, or the Archives of Canada may supply single copies of the dissertation.”
},
  author       = {Prach, Bernd},
  issn         = {2663-337X},
  pages        = {84},
  publisher    = {Institute of Science and Technology Austria},
  title        = {{Robust image classification with 1-Lipschitz networks}},
  doi          = {10.15479/at-ista-19759},
  year         = {2025},
}

@inproceedings{18118,
  abstract     = {We introduce a new framework for studying meta-learning methods using PAC-Bayesian theory. Its main advantage over previous work is that it allows for more flexibility in how the transfer of knowledge between tasks is realized. For previous approaches, this could only happen indirectly, by means of learning prior distributions over models. In contrast, the new generalization bounds that we prove express the process of meta-learning much more directly as learning the learning algorithm that should be used for future tasks. The flexibility of our framework makes it suitable to analyze a wide range of meta-learning mechanisms and even design new mechanisms. Other than our theoretical contributions we also show empirically that our framework improves the prediction quality in practical meta-learning mechanisms.},
  author       = {Zakerinia, Hossein and Behjati, Amin and Lampert, Christoph},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {58122--58139},
  publisher    = {ML Research Press},
  title        = {{More flexible PAC-Bayesian meta-learning by learning learning algorithms}},
  volume       = {235},
  year         = {2024},
}

@article{18856,
  abstract     = {This research is aimed to solve the tweet/user geolocation prediction task and provide a flexible methodology for the geo-tagging of textual big data. The suggested approach implements neural networks for natural language processing (NLP) to estimate the location as coordinate pairs (longitude, latitude) and two-dimensional Gaussian Mixture Models (GMMs). The scope of proposed models has been finetuned on a Twitter dataset using pretrained Bidirectional Encoder Representations from Transformers (BERT) as base models. Performance metrics show a median error of fewer than 30 km on a worldwide-level, and fewer than 15 km on the US-level datasets for the models trained and evaluated on text features of tweets' content and metadata context. Our source code and data are available at https://github.com/K4TEL/geo-twitter.git.},
  author       = {Lutsai, Kateryna and Lampert, Christoph},
  issn         = {1948-660X},
  journal      = {Journal of Spatial Information Science},
  number       = {29},
  pages        = {69--99},
  publisher    = {University of Maine},
  title        = {{Predicting the geolocation of tweets using transformer models on customized data}},
  doi          = {10.5311/JOSIS.2024.29.295},
  year         = {2024},
}

@inproceedings{18875,
  abstract     = {Current state-of-the-art methods for differentially private model training are based on matrix factorization techniques. However, these methods suffer from high computational overhead because they require numerically solving a demanding optimization problem to determine an approximately optimal factorization prior to the actual model training. In this work, we present a new matrix factorization approach, BSR, which overcomes this computational bottleneck. By exploiting properties of the standard matrix square root, BSR allows to efficiently handle also large-scale problems. For the key scenario of stochastic gradient descent with momentum and weight decay, we even derive analytical expressions for BSR that render the computational overhead negligible. We prove bounds on the approximation quality that hold both in the centralized and in the federated learning setting. Our numerical experiments demonstrate that models trained using BSR perform on par with the best existing methods, while completely avoiding their computational overhead.},
  author       = {Kalinin, Nikita and Lampert, Christoph},
  booktitle    = {38th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Banded square root matrix factorization for differentially private model training}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{18891,
  abstract     = {Deep neural networks (DNNs) exhibit a surprising structure in their final layer
known as neural collapse (NC), and a growing body of works has currently investigated the propagation of neural collapse to earlier layers of DNNs – a phenomenon
called deep neural collapse (DNC). However, existing theoretical results are restricted to special cases: linear models, only two layers or binary classification.
In contrast, we focus on non-linear models of arbitrary depth in multi-class classification and reveal a surprising qualitative shift. As soon as we go beyond two
layers or two classes, DNC stops being optimal for the deep unconstrained features
model (DUFM) – the standard theoretical framework for the analysis of collapse.
The main culprit is a low-rank bias of multi-layer regularization schemes: this bias
leads to optimal solutions of even lower rank than the neural collapse. We support
our theoretical findings with experiments on both DUFM and real data, which show
the emergence of the low-rank structure in the solution found by gradient descent.},
  author       = {Súkeník, Peter and Lampert, Christoph and Mondelli, Marco},
  booktitle    = {38th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Neural collapse versus low-rank bias: Is deep neural collapse really optimal?}},
  volume       = {37},
  year         = {2024},
}

@unpublished{19063,
  abstract     = {Instruction-tuned Large Language Models (LLMs) show impressive results in numerous practical applications, but they lack essential safety features that are common in other areas of computer science, particularly an explicit separation of instructions and data. This makes them vulnerable to manipulations such as indirect prompt injections and generally unsuitable for safety-critical tasks. Surprisingly, there is currently no established definition or benchmark to quantify this phenomenon. In this work, we close this gap by introducing a formal measure for instruction-data separation and an empirical variant that is calculable from a model's outputs. We also present a new dataset, SEP, that allows estimating the measure for real-world models. Our results on various LLMs show that the problem of instruction-data separation is real: all models fail to achieve high separation, and canonical mitigation techniques, such as prompt engineering and fine-tuning, either fail to substantially improve separation or reduce model utility. The source code and SEP dataset are openly accessible at https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.
},
  author       = {Zverev, Egor and Abdelnabi, Sahar and Tabesh, Soroush and Fritz, Mario and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{Can LLMs separate instructions from data? And what do we even mean by that?}},
  doi          = {10.48550/arXiv.2403.06833},
  year         = {2024},
}

@article{19408,
  abstract     = {Continual learning is a subfield of machine learning, which aims to allow machine learning models to continuously learn on new data, by accumulating knowledge without forgetting what was learned in the past. In this work, we take a step back, and ask: "Why should one care about continual learning in the first place?". We set the stage by examining recent continual learning papers published at four major machine learning conferences, and show that memory-constrained settings dominate the field. Then, we discuss five open problems in machine learning, and even though they might seem unrelated to continual learning at first sight, we show that continual learning will inevitably be part of their solution. These problems are model editing, personalization and specialization, on-device learning, faster (re-)training and reinforcement learning. Finally, by comparing the desiderata from these unsolved problems and the current assumptions in continual learning, we highlight and discuss four future directions for continual learning research. We hope that this work offers an interesting perspective on the future of continual learning, while displaying its potential value and the paths we have to pursue in order to make it successful. This work is the result of the many discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, in March 2023.},
  author       = {Verwimp, Eli and Aljundi, Rahaf and Ben-David, Shai and Bethge, Matthias and Cossu, Andrea and Gepperth, Alexander and Hayes, Tyler L. and Hüllermeier, Eyke and Kanan, Christopher and Kudithipudi, Dhireesha and Lampert, Christoph and Mundt, Martin and Pascanu, Razvan and Popescu, Adrian and Tolias, Andreas S. and Van De Weijer, Joost and Liu, Bing and Lomonaco, Vincenzo and Tuytelaars, Tinne and Van De Ven, Gido M.},
  issn         = {2835-8856},
  journal      = {Transactions on Machine Learning Research},
  publisher    = {Transactions on Machine Learning Research},
  title        = {{Continual learning: Applications and the road forward}},
  volume       = {2024},
  year         = {2024},
}

@inproceedings{17093,
  abstract     = {Federated Learning (FL) enables large-scale distributed training of machine learning models, while still allowing individual nodes to maintain data locally. However, executing FL at scale comes with inherent practical challenges: 1) heterogeneity of the local node data distributions, 2) heterogeneity of node computational speeds (asynchrony), but also 3) constraints in the amount of communication between the clients and the server. In this work, we present the first variant of the classic federated averaging (FedAvg) algorithm which, at the same time, supports data heterogeneity, partial client asynchrony, and communication compression. Our algorithm comes with a novel, rigorous analysis showing that, in spite of these system relaxations, it can provide similar convergence to FedAvg in interesting parameter regimes. Experimental results in the rigorous LEAF benchmark on setups of up to 300 nodes show that our algorithm ensures fast convergence for standard federated tasks, improving upon prior quantized and asynchronous approaches.},
  author       = {Zakerinia, Hossein and Talaei, Shayan and Nadiradze, Giorgi and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of the 27th International Conference on Artificial Intelligence and Statistics},
  issn         = {2640-3498},
  location     = {Valencia, Spain},
  pages        = {3448--3456},
  publisher    = {ML Research Press},
  title        = {{Communication-efficient federated learning with data and client heterogeneity}},
  volume       = {238},
  year         = {2024},
}

@inproceedings{17411,
  abstract     = {We present PeFLL, a new personalized federated learning algorithm that improves
over the state-of-the-art in three aspects: 1) it produces more accurate models,
especially in the low-data regime, and not only for clients present during its
training phase, but also for any that may emerge in the future; 2) it reduces the
amount of on-client computation and client-server communication by providing
future clients with ready-to-use personalized models that require no additional
finetuning or optimization; 3) it comes with theoretical guarantees that establish
generalization from the observed clients to future ones.
At the core of PeFLL lies a learning-to-learn approach that jointly trains an
embedding network and a hypernetwork. The embedding network is used to
represent clients in a latent descriptor space in a way that reflects their similarity
to each other. The hypernetwork takes as input such descriptors and outputs the
parameters of fully personalized client models. In combination, both networks
constitute a learning algorithm that achieves state-of-the-art performance in several
personalized federated learning benchmarks.},
  author       = {Scott, Jonathan A and Zakerinia, Hossein and Lampert, Christoph},
  booktitle    = {12th International Conference on Learning Representations},
  location     = {Vienna, Austria},
  publisher    = {OpenReview},
  title        = {{PeFLL: Personalized federated learning by learning to learn}},
  year         = {2024},
}

@inproceedings{18120,
  abstract     = {In practice, training using federated learning can be orders of magnitude slower than standard centralized training. This severely limits the amount of experimentation and tuning that can be done, making it challenging to obtain good performance on a given task. Server-side proxy data can be used to run training simulations, for instance for hyperparameter tuning. This can greatly speed up the training pipeline by reducing the number of tuning runs to be performed overall on the true clients. However, it is challenging to ensure that these simulations accurately reflect the dynamics of the real federated training. In particular, the proxy data used for simulations often comes as a single centralized dataset without a partition into distinct clients, and partitioning this data in a naive way can lead to simulations that poorly reflect real federated training. In this paper we address the challenge of how to partition centralized data in a way that reflects the statistical heterogeneity of the true federated clients. We propose a fully federated, theoretically justified, algorithm that efficiently learns the distribution of the true clients and observe improved server-side simulations when using the inferred distribution to create simulated clients from the centralized data.},
  author       = {Scott, Jonathan A and Cahill, Áine},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {44012--44037},
  publisher    = {ML Research Press},
  title        = {{Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{17426,
  abstract     = {The robustness of neural networks against input perturbations with bounded
magnitude represents a serious concern in the deployment of deep learning
models in safety-critical systems. Recently, the scientific community has
focused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz
neural networks that leverage Lipschitz bounded dense and convolutional layers.
Although different methods have been proposed in the literature to achieve this
goal, understanding the performance of such methods is not straightforward,
since different metrics can be relevant (e.g., training time, memory usage,
accuracy, certifiable robustness) for different applications. For this reason,
this work provides a thorough theoretical and empirical comparison between
methods by evaluating them in terms of memory usage, speed, and certifiable
robust accuracy. The paper also provides some guidelines and recommendations to
support the user in selecting the methods that work best depending on the
available resources. We provide code at
https://github.com/berndprach/1LipschitzLayersCompared.},
  author       = {Prach, Bernd and Brau, Fabio and Buttazzo, Giorgio and Lampert, Christoph},
  booktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  location     = {Seattle, WA, United States},
  pages        = {24574--24583},
  publisher    = {Computer Vision Foundation},
  title        = {{1-Lipschitz layers compared: Memory, speed, and certifiable robustness}},
  doi          = {10.1109/CVPR52733.2024.02320},
  year         = {2024},
}

@unpublished{18874,
  abstract     = {Despite extensive research since the community learned about adversarial
examples 10 years ago, we still do not know how to train high-accuracy
classifiers that are guaranteed to be robust to small perturbations of their
inputs. Previous works often argued that this might be because no classifier
exists that is robust and accurate at the same time. However, in computer
vision this assumption does not match reality where humans are usually accurate
and robust on most tasks of interest. We offer an alternative explanation and
show that in certain settings robust generalization is only possible with
unrealistically large amounts of data. More precisely we find a setting where a
robust classifier exists, it is easy to learn an accurate classifier, yet it
requires an exponential amount of data to learn a robust classifier. Based on
this theoretical result, we explore how well robust classifiers generalize on
datasets such as CIFAR-10. We come to the conclusion that on this dataset, the
limitation of current robust models also lies in the generalization, and that
they require a lot of data to do well on the test set. We also show that the
problem is not in the expressiveness or generalization capabilities of current
architectures, and that there are low magnitude features in the data which are
useful for non-robust generalization but are not available for robust
classifiers.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{Intriguing properties of robust classification}},
  doi          = {10.48550/arXiv.2412.04245},
  year         = {2024},
}

