% PhD thesis: "Robust image classification with 1-Lipschitz networks".
% The school field mirrors publisher because the standard BibTeX styles
% require school for thesis entries and do not print publisher for them.
% NOTE(review): the DOI below repeats the 10.15479 prefix. This looks like a
% data-entry slip (expected form would be 10.15479/at-ista-19759), but the
% registered identifier could not be confirmed from this file -- verify with
% the DOI resolver before changing it.
@phdthesis{19759,
  abstract     = {Despite generating remarkable results in various computer vision tasks, deep learning comes
with some surprising shortcomings. For example, tiny perturbations, often imperceptible to
the human eye, can completely change the predictions of image classifiers. Despite a decade
of research, the field has made limited progress in developing image classifiers that are both
accurate and robust. This thesis aims to address this gap.
As our first contribution, we aim to simplify the process of training certifiably robust image
classifiers. We do this by designing a convolutional layer that does not require executing an
iterative procedure in every forward pass, but relies on an explicit bound instead. We also
propose a loss function that allows optimizing for a particular margin more precisely.
Next, we provide an overview and comparison of various methods that create robust image
classifiers by constraining the Lipschitz constant. This is important since generally longer
training times and more parameters improve the performance of robust classifiers, making it
challenging to determine the most practical and effective methods from existing literature.
In 1-Lipschitz classification, the performance of current methods is still much worse than what
we expect on the simple tasks we consider. Therefore, we next investigate potential causes of
this shortcoming. We first consider the role of the activation function. We prove a theoretical
shortcoming of the commonly used activation function, and provide an alternative without it.
However this theoretical improvement does barely translate to the empirical performance of
robust classifiers, suggesting a different bottleneck.
Therefore, in the final chapter, we study how the performance depends on the amount of
training data. We prove that in the worst case, we might require far more data to train a
robust classifier compared to a normal one. We furthermore find that the amount of training
data is a key determinant of the performance current methods achieve on popular datasets.
Additionally, we show that linear subspaces exist with tiny data variance, and yet we can
still train very accurate classifiers after projecting into those subspaces. This shows that on
the datasets considered, enforcing robustness in classification makes the task strictly more
challenging.

-----------------“In reference to IEEE copyrighted material which is used with permission in this thesis, the IEEE does not endorse any of [name of university or educational entity]’s products or services. Internal or personal use of this material is permitted. If interested in reprinting/republishing IEEE copyrighted material for advertising or promotional purposes or for creating new collective works for resale or redistribution, please go to http://www.ieee.org/publications_standards/publications/rights/rights_link.html to learn how to obtain a License from RightsLink. If applicable, University Microfilms and/or ProQuest Library, or the Archives of Canada may supply single copies of the dissertation.”
},
  author       = {Prach, Bernd},
  issn         = {2663-337X},
  pages        = {84},
  publisher    = {Institute of Science and Technology Austria},
  school       = {Institute of Science and Technology Austria},
  title        = {{Robust image classification with 1-Lipschitz networks}},
  doi          = {10.15479/10.15479/at-ista-19759},
  year         = {2025},
}

% Workshop paper (2025 IEEE/CVF CVPR Workshops) by Prach and Lampert.
% Shares title and authors with the arXiv preprint stored under key 18874.
% Abstract cleaned of duplicated words ("the the", "that that") and a
% singular/plural slip ("a strictly harder tasks").
@inproceedings{20455,
  abstract     = {Despite extensive research since the community learned about adversarial examples 10 years ago, we still do not know how to train high-accuracy classifiers that are guaranteed to be robust to small perturbations of their inputs. Previous works often argued that this might be because no classifier exists that is robust and accurate at the same time. However, in computer vision this assumption does not match reality where humans are usually accurate and robust on most tasks of interest. We offer an alternative explanation and show that in certain settings robust generalization is only possible with unrealistically large amounts of data. Specifically, we find a setting where a robust classifier exists, it is easy to learn an accurate classifier, yet it requires an exponential amount of data to learn a robust classifier. Based on this theoretical result, we evaluate the influence of the amount of training data on datasets such as CIFAR10. Our findings indicate that the amount of training data is the main factor determining the robust performance. Furthermore we show that there are low magnitude directions in the data which are useful for non-robust generalization but are not available for robust classifiers. This implies that robust classification is a strictly harder task than normal classification, thereby providing an explanation why robust classification requires more data.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},
  isbn         = {9798331599942},
  issn         = {2160-7516},
  location     = {Nashville, TN, United States},
  pages        = {660--669},
  publisher    = {IEEE},
  title        = {{Intriguing properties of robust classification}},
  doi          = {10.1109/CVPRW67362.2025.00071},
  year         = {2025},
}

% arXiv preprint by Prach and Lampert (arXiv identifier 2412.04245, taken
% from the DOI). Shares title and authors with the workshop paper stored
% under key 20455.
% The note field is required by the unpublished entry type in the standard
% styles; eprint/archiveprefix expose the arXiv identifier to styles that
% support it. booktitle is not used by this entry type and is kept only so
% existing tooling that reads it keeps working.
@unpublished{18874,
  abstract     = {Despite extensive research since the community learned about adversarial
examples 10 years ago, we still do not know how to train high-accuracy
classifiers that are guaranteed to be robust to small perturbations of their
inputs. Previous works often argued that this might be because no classifier
exists that is robust and accurate at the same time. However, in computer
vision this assumption does not match reality where humans are usually accurate
and robust on most tasks of interest. We offer an alternative explanation and
show that in certain settings robust generalization is only possible with
unrealistically large amounts of data. More precisely we find a setting where a
robust classifier exists, it is easy to learn an accurate classifier, yet it
requires an exponential amount of data to learn a robust classifier. Based on
this theoretical result, we explore how well robust classifiers generalize on
datasets such as CIFAR-10. We come to the conclusion that on these datasets, the
limitation of current robust models also lies in the generalization, and that
they require a lot of data to do well on the test set. We also show that the
problem is not in the expressiveness or generalization capabilities of current
architectures, and that there are low magnitude features in the data which are
useful for non-robust generalization but are not available for robust
classifiers.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{Intriguing properties of robust classification}},
  note         = {Preprint},
  eprint       = {2412.04245},
  archiveprefix = {arXiv},
  doi          = {10.48550/arXiv.2412.04245},
  year         = {2024},
}

% Conference paper (IEEE/CVF CVPR proceedings, 2024) comparing methods for
% building 1-Lipschitz layers. Bibliographic fields come first; the abstract
% is informational only and is placed last.
@inproceedings{17426,
  author       = {Prach, Bernd and Brau, Fabio and Buttazzo, Giorgio and Lampert, Christoph},
  title        = {{1-Lipschitz layers compared: Memory, speed, and certifiable robustness}},
  booktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  location     = {Seattle, WA, United States},
  pages        = {24574--24583},
  publisher    = {Computer Vision Foundation},
  year         = {2024},
  doi          = {10.1109/CVPR52733.2024.02320},
  abstract     = {The robustness of neural networks against input perturbations with bounded
magnitude represents a serious concern in the deployment of deep learning
models in safety-critical systems. Recently, the scientific community has
focused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz
neural networks that leverage Lipschitz bounded dense and convolutional layers.
Although different methods have been proposed in the literature to achieve this
goal, understanding the performance of such methods is not straightforward,
since different metrics can be relevant (e.g., training time, memory usage,
accuracy, certifiable robustness) for different applications. For this reason,
this work provides a thorough theoretical and empirical comparison between
methods by evaluating them in terms of memory usage, speed, and certifiable
robust accuracy. The paper also provides some guidelines and recommendations to
support the user in selecting the methods that work best depending on the
available resources. We provide code at
https://github.com/berndprach/1LipschitzLayersCompared.},
}

% arXiv preprint by Prach and Lampert (arXiv identifier 2311.06103, taken
% from the DOI).
% The note field is required by the unpublished entry type in the standard
% styles; eprint/archiveprefix expose the arXiv identifier to styles that
% support it. DOI casing is normalised to match the other arXiv entry in this
% file (DOIs are case-insensitive, so it resolves identically).
% NOTE(review): the abstract ends with "this https URL", which looks like a
% scraping artefact where the real code link was dropped. The actual URL is
% not recoverable from this file -- fill it in from the paper.
@unpublished{15039,
  abstract     = {A crucial property for achieving secure, trustworthy and interpretable deep learning systems is their robustness: small changes to a system's inputs should not result in large changes to its outputs. Mathematically, this means one strives for networks with a small Lipschitz constant. Several recent works have focused on how to construct such Lipschitz networks, typically by imposing constraints on the weight matrices. In this work, we study an orthogonal aspect, namely the role of the activation function. We show that commonly used activation functions, such as MaxMin, as well as all piece-wise linear ones with two segments unnecessarily restrict the class of representable functions, even in the simplest one-dimensional setting. We furthermore introduce the new N-activation function that is provably more expressive than currently popular activation functions. We provide code at this https URL.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{1-Lipschitz neural networks are more expressive with N-activations}},
  note         = {Preprint},
  eprint       = {2311.06103},
  archiveprefix = {arXiv},
  doi          = {10.48550/arXiv.2311.06103},
  year         = {2023},
}

% Conference paper (ECCV 2022, Springer proceedings) introducing
% almost-orthogonal Lipschitz (AOL) layers.
% The series field names the book series that the volume number belongs to;
% without it styles print a bare volume number with no series context.
@inproceedings{11839,
  abstract     = {It is a highly desirable property for deep networks to be robust against
small input changes. One popular way to achieve this property is by designing
networks with a small Lipschitz constant. In this work, we propose a new
technique for constructing such Lipschitz networks that has a number of
desirable properties: it can be applied to any linear network layer
(fully-connected or convolutional), it provides formal guarantees on the
Lipschitz constant, it is easy to implement and efficient to run, and it can be
combined with any training objective and optimization method. In fact, our
technique is the first one in the literature that achieves all of these
properties simultaneously. Our main contribution is a rescaling-based weight
matrix parametrization that guarantees each network layer to have a Lipschitz
constant of at most 1 and results in the learned weight matrices to be close to
orthogonal. Hence we call such layers almost-orthogonal Lipschitz (AOL).
Experiments and ablation studies in the context of image classification with
certified robust accuracy confirm that AOL layers achieve results that are on
par with most existing methods. Yet, they are simpler to implement and more
broadly applicable, because they do not require computationally expensive
matrix orthogonalization or inversion steps as part of the network
architecture. We provide code at https://github.com/berndprach/AOL.},
  author       = {Prach, Bernd and Lampert, Christoph},
  booktitle    = {Computer Vision – ECCV 2022},
  isbn         = {9783031198021},
  location     = {Tel Aviv, Israel},
  pages        = {350--365},
  publisher    = {Springer Nature},
  series       = {Lecture Notes in Computer Science},
  title        = {{Almost-orthogonal layers for efficient general-purpose Lipschitz networks}},
  doi          = {10.1007/978-3-031-19803-8_21},
  volume       = {13681},
  year         = {2022},
}