@article{12662,
  abstract     = {Modern machine learning tasks often require considering not just one but multiple objectives. For example, besides the prediction quality, this could be the efficiency, robustness or fairness of the learned models, or any of their combinations. Multi-objective learning offers a natural framework for handling such problems without having to commit to early trade-offs. Surprisingly, statistical learning theory so far offers almost no insight into the generalization properties of multi-objective learning. In this work, we make first steps to fill this gap: We establish foundational generalization bounds for the multi-objective setting as well as generalization and excess bounds for learning with scalarizations. We also provide the first theoretical analysis of the relation between the Pareto-optimal sets of the true objectives and the Pareto-optimal sets of their empirical approximations from training data. In particular, we show a surprising asymmetry: All Pareto-optimal solutions can be approximated by empirically Pareto-optimal ones, but not vice versa.},
  author       = {Súkeník, Peter and Lampert, Christoph},
  issn         = {1433-3058},
  journal      = {Neural Computing and Applications},
  pages        = {24669--24683},
  publisher    = {Springer Nature},
  title        = {{Generalization in multi-objective machine learning}},
  doi          = {10.1007/s00521-024-10616-1},
  volume       = {37},
  year         = {2025},
}

@inproceedings{20035,
  abstract     = {Deep neural networks (DNNs) at convergence consistently represent the training data in the last layer via a geometric structure referred to as neural collapse. This empirical evidence has spurred a line of theoretical research aimed at proving the emergence of neural collapse, mostly focusing on the unconstrained features model. Here, the features of the penultimate layer are free variables, which makes the model data-agnostic and puts into question its ability to capture DNN training. Our work addresses the issue, moving away from unconstrained features and studying DNNs that end with at least two linear layers. We first prove generic guarantees on neural collapse that assume (i) low training error and balancedness of linear layers (for within-class variability collapse), and (ii) bounded conditioning of the features before the linear part (for orthogonality of class-means, and their alignment with weight matrices). The balancedness refers to the fact that $W_{\ell+1}^\top W_{\ell+1} \approx W_\ell W_\ell^\top$ for any pair of consecutive weight matrices of the linear part, and the bounded conditioning requires a well-behaved ratio between largest and smallest non-zero singular values of the features. We then show that such assumptions hold for gradient descent training with weight decay: (i) for networks with a wide first layer, we prove low training error and balancedness, and (ii) for solutions that are either nearly optimal or stable under large learning rates, we additionally prove the bounded conditioning. Taken together, our results are the first to show neural collapse in the end-to-end training of DNNs.},
  author       = {Jacot, Arthur and Súkeník, Peter and Wang, Zihan and Mondelli, Marco},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {1905--1931},
  publisher    = {ICLR},
  title        = {{Wide neural networks trained with weight decay provably exhibit neural collapse}},
  year         = {2025},
}

@inproceedings{18890,
  author = {Beaglehole, Daniel and Súkeník, Peter and Mondelli, Marco and Belkin, Mikhail},
  title = {{Average gradient outer product as a mechanism for deep neural collapse}},
  booktitle = {38th Annual Conference on Neural Information Processing Systems},
  volume = {37},
  publisher = {Neural Information Processing Systems Foundation},
  location = {Vancouver, Canada},
  issn = {1049-5258},
  year = {2024},
  abstract = {Deep Neural Collapse (DNC) refers to the surprisingly rigid structure of the data representations in the final layers of Deep Neural Networks (DNNs). Though the phenomenon has been measured in a variety of settings, its emergence is typically explained via data-agnostic approaches, such as the unconstrained features model. In this work, we introduce a data-dependent setting where DNC forms due to feature learning through the average gradient outer product (AGOP). The AGOP is defined with respect to a learned predictor and is equal to the uncentered covariance matrix of its input-output gradients averaged over the training dataset. The Deep Recursive Feature Machine (Deep RFM) is a method that constructs a neural network by iteratively mapping the data with the AGOP and applying an untrained random feature map. We demonstrate empirically that DNC occurs in Deep RFM across standard settings as a consequence of the projection with the AGOP matrix computed at each layer. Further, we theoretically explain DNC in Deep RFM in an asymptotic setting and as a result of kernel learning. We then provide evidence that this mechanism holds for neural networks more generally. In particular, we show that the right singular vectors and values of the weights can be responsible for the majority of within-class variability collapse for DNNs trained in the feature learning regime. As observed in recent work, this singular structure is highly correlated with that of the AGOP.},
}

@inproceedings{18891,
  abstract     = {Deep neural networks (DNNs) exhibit a surprising structure in their final layer known as neural collapse (NC), and a growing body of works has currently investigated the propagation of neural collapse to earlier layers of DNNs – a phenomenon called deep neural collapse (DNC). However, existing theoretical results are restricted to special cases: linear models, only two layers or binary classification. In contrast, we focus on non-linear models of arbitrary depth in multi-class classification and reveal a surprising qualitative shift. As soon as we go beyond two layers or two classes, DNC stops being optimal for the deep unconstrained features model (DUFM) – the standard theoretical framework for the analysis of collapse. The main culprit is a low-rank bias of multi-layer regularization schemes: this bias leads to optimal solutions of even lower rank than the neural collapse. We support our theoretical findings with experiments on both DUFM and real data, which show the emergence of the low-rank structure in the solution found by gradient descent.},
  author       = {Súkeník, Peter and Lampert, Christoph and Mondelli, Marco},
  booktitle    = {38th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Neural collapse versus low-rank bias: Is deep neural collapse really optimal?}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{14921,
  abstract     = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart – deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC – the unconstrained features model – to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.},
  author       = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph},
  booktitle    = {37th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {New Orleans, LA, United States},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Deep neural collapse is provably optimal for the deep unconstrained features model}},
  volume       = {36},
  year         = {2023},
}

@inproceedings{12664,
  author = {Súkeník, Peter and Kuvshinov, Aleksei and Günnemann, Stephan},
  title = {{Intriguing properties of input-dependent randomized smoothing}},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  volume = {162},
  pages = {20697--20743},
  publisher = {ML Research Press},
  location = {Baltimore, MD, United States},
  year = {2022},
  abstract = {Randomized smoothing is currently considered the state-of-the-art method to obtain certifiably robust classifiers. Despite its remarkable performance, the method is associated with various serious problems such as “certified accuracy waterfalls”, certification vs. accuracy trade-off, or even fairness issues. Input-dependent smoothing approaches have been proposed with intention of overcoming these flaws. However, we demonstrate that these methods lack formal guarantees and so the resulting certificates are not justified. We show that in general, the input-dependent smoothing suffers from the curse of dimensionality, forcing the variance function to have low semi-elasticity. On the other hand, we provide a theoretical and practical framework that enables the usage of input-dependent smoothing even in the presence of the curse of dimensionality, under strict restrictions. We present one concrete design of the smoothing variance function and test it on CIFAR10 and MNIST. Our design mitigates some of the problems of classical smoothing and is formally underlined, yet further improvement of the design is still necessary.},
}

@inproceedings{18876,
  abstract     = {Convolutional neural networks were the standard for solving many computer vision tasks until recently, when Transformers of MLP-based architectures have started to show competitive performance. These architectures typically have a vast number of weights and need to be trained on massive datasets; hence, they are not suitable for their use in low-data regimes. In this work, we propose a simple yet effective framework to improve generalization from small amounts of data. We augment modern CNNs with fully-connected (FC) layers and show the massive impact this architectural change has in low-data regimes. We further present an online joint knowledge-distillation method to utilize the extra FC layers at train time but avoid them during test time. This allows us to improve the generalization of a CNN-based model without any increase in the number of weights at test time. We perform classification experiments for a large range of network backbones and several standard datasets on supervised learning and active learning. Our experiments significantly outperform the networks without fully-connected layers, reaching a relative improvement of up to 16\% validation accuracy in the supervised setting without adding any extra parameters during inference.},
  author       = {Kocsis, Peter and Súkeník, Peter and Brasó, Guillem and Niessner, Matthias and Leal-Taixé, Laura and Elezi, Ismail},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {New Orleans, LA, United States},
  pages        = {1896--1908},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The unreasonable effectiveness of fully-connected layers for low-data regimes}},
  volume       = {35},
  year         = {2022},
}

